{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.999086201644837, "eval_steps": 500, "global_step": 4100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 92.953125, "completions/mean_terminated_length": 92.953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0012183978068839476, "grad_norm": 3.4737374782562256, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0788, "num_tokens": 12693.0, "reward": -13.051025390625, "reward_std": 1.8193588256835938, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -26.42236328125, "rewards/ppl_reward/std": 20.007047653198242, "rewards/tag_count_reward/mean": 0.14453125, "rewards/tag_count_reward/std": 0.2738782465457916, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 80.328125, "completions/mean_terminated_length": 80.328125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0024367956137678953, "grad_norm": 4.619309902191162, "kl": 0.0, "learning_rate": 4.8780487804878054e-08, "loss": -0.0776, "num_tokens": 24426.0, "reward": -16.17578125, "reward_std": 0.1257670819759369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -32.4609375, "rewards/ppl_reward/std": 58.27933883666992, "rewards/tag_count_reward/mean": 0.0546875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 85.28125, "completions/mean_terminated_length": 85.28125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.003655193420651843, "grad_norm": 3.689451217651367, "kl": 0.0008955001831054688, "learning_rate": 9.756097560975611e-08, "loss": -0.1217, "num_tokens": 37308.0, "reward": -5.1650390625, "reward_std": 0.354502409696579, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.509765625, "rewards/ppl_reward/std": 4.588176727294922, "rewards/tag_count_reward/mean": 0.08984375, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.004873591227535791, "grad_norm": 4.227373123168945, "kl": 0.0009326934814453125, "learning_rate": 1.4634146341463415e-07, "loss": -0.1196, "num_tokens": 50676.0, "reward": -5.310546875, "reward_std": 0.2866164445877075, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.90234375, "rewards/ppl_reward/std": 4.9163689613342285, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.2847827076911926, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 81.671875, "completions/mean_terminated_length": 81.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.006091989034419738, "grad_norm": 3.950299024581909, "kl": 0.0009775161743164062, "learning_rate": 1.9512195121951221e-07, "loss": -0.0305, "num_tokens": 62383.0, "reward": -10.8575439453125, "reward_std": 0.24512410163879395, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -21.949462890625, "rewards/ppl_reward/std": 21.104360580444336, "rewards/tag_count_reward/mean": 0.1171875, "rewards/tag_count_reward/std": 0.255761981010437, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 76.515625, "completions/mean_terminated_length": 76.515625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.007310386841303686, "grad_norm": 3.640664577484131, "kl": 0.0009918212890625, "learning_rate": 2.439024390243903e-07, "loss": -0.0345, "num_tokens": 73912.0, "reward": -6.015869140625, "reward_std": 0.9091383218765259, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.28955078125, "rewards/ppl_reward/std": 8.54690170288086, "rewards/tag_count_reward/mean": 0.12890625, "rewards/tag_count_reward/std": 0.2597014009952545, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 96.609375, "completions/mean_terminated_length": 96.609375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.008528784648187633, "grad_norm": 3.89339017868042, "kl": 0.000904083251953125, "learning_rate": 2.926829268292683e-07, "loss": -0.0588, "num_tokens": 86855.0, "reward": -5.9375, "reward_std": 0.19046564400196075, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.0625, "rewards/ppl_reward/std": 6.548203468322754, "rewards/tag_count_reward/mean": 0.09375, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 85.109375, "completions/mean_terminated_length": 85.109375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.009747182455071581, "grad_norm": 3.8493499755859375, "kl": 0.001018524169921875, "learning_rate": 3.414634146341464e-07, "loss": -0.0727, "num_tokens": 99638.0, "reward": -4.97119140625, "reward_std": 0.2910235524177551, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.1845703125, "rewards/ppl_reward/std": 5.740010738372803, "rewards/tag_count_reward/mean": 0.12109375, "rewards/tag_count_reward/std": 0.2558528780937195, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 83.59375, "completions/mean_terminated_length": 83.59375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.01096558026195553, "grad_norm": 4.098174571990967, "kl": 0.0010280609130859375, "learning_rate": 3.9024390243902443e-07, "loss": -0.1211, "num_tokens": 112156.0, "reward": -8.145751953125, "reward_std": 0.24081315100193024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -16.49462890625, "rewards/ppl_reward/std": 22.282073974609375, "rewards/tag_count_reward/mean": 0.1015625, "rewards/tag_count_reward/std": 0.24688033759593964, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 88.9375, "completions/mean_terminated_length": 88.9375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.012183978068839476, "grad_norm": 4.002153396606445, "kl": 0.0009546279907226562, "learning_rate": 4.3902439024390246e-07, "loss": -0.1323, "num_tokens": 125632.0, "reward": -22.091796875, "reward_std": 0.19503436982631683, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -44.46484375, "rewards/ppl_reward/std": 61.6456413269043, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.2777281701564789, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 101.734375, "completions/mean_terminated_length": 101.734375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.013402375875723424, "grad_norm": 4.0138349533081055, "kl": 0.0009546279907226562, "learning_rate": 4.878048780487805e-07, "loss": -0.0581, "num_tokens": 139263.0, "reward": -5.433837890625, "reward_std": 0.4000650644302368, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.07080078125, "rewards/ppl_reward/std": 4.511773109436035, "rewards/tag_count_reward/mean": 0.1015625, "rewards/tag_count_reward/std": 0.2345155030488968, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 88.359375, "completions/mean_terminated_length": 88.359375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.014620773682607372, "grad_norm": 3.9165263175964355, "kl": 0.00153350830078125, "learning_rate": 5.365853658536586e-07, "loss": -0.0721, "num_tokens": 151550.0, "reward": -9.048095703125, "reward_std": 0.721196711063385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -18.42431640625, "rewards/ppl_reward/std": 9.703301429748535, "rewards/tag_count_reward/mean": 0.1640625, "rewards/tag_count_reward/std": 0.2824873626232147, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 77.15625, "completions/mean_terminated_length": 77.15625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.015839171489491318, "grad_norm": 4.571539878845215, "kl": 0.00272369384765625, "learning_rate": 5.853658536585366e-07, "loss": -0.0481, "num_tokens": 163424.0, "reward": -5.80859375, "reward_std": 0.2287602424621582, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.9296875, "rewards/ppl_reward/std": 5.040430545806885, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.29378482699394226, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 83.5625, "completions/mean_terminated_length": 83.5625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.017057569296375266, "grad_norm": 5.99103307723999, "kl": 0.00244903564453125, "learning_rate": 6.341463414634146e-07, "loss": -0.2157, "num_tokens": 175124.0, "reward": -11.65869140625, "reward_std": 0.40626195073127747, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -23.8408203125, "rewards/ppl_reward/std": 28.19257164001465, "rewards/tag_count_reward/mean": 0.26171875, "rewards/tag_count_reward/std": 0.3433890640735626, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 101.421875, "completions/mean_terminated_length": 101.421875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.018275967103259214, "grad_norm": 4.324979305267334, "kl": 0.003261566162109375, "learning_rate": 6.829268292682928e-07, "loss": -0.1476, "num_tokens": 189239.0, "reward": -12.95849609375, "reward_std": 0.5275493264198303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -26.1748046875, "rewards/ppl_reward/std": 36.72951126098633, "rewards/tag_count_reward/mean": 0.12890625, "rewards/tag_count_reward/std": 0.2314215451478958, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 121.109375, "completions/mean_terminated_length": 121.109375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.019494364910143162, "grad_norm": 3.4768612384796143, "kl": 0.005199432373046875, "learning_rate": 7.317073170731707e-07, "loss": 0.0073, "num_tokens": 204238.0, "reward": -10.217041015625, "reward_std": 0.411332905292511, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -21.06689453125, "rewards/ppl_reward/std": 18.320594787597656, "rewards/tag_count_reward/mean": 0.31640625, "rewards/tag_count_reward/std": 0.3485861122608185, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 110.984375, "completions/mean_terminated_length": 110.984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.02071276271702711, "grad_norm": 3.256314992904663, "kl": 0.0117340087890625, "learning_rate": 7.804878048780489e-07, "loss": -0.0267, "num_tokens": 218821.0, "reward": -10.857177734375, "reward_std": 0.5259931087493896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -22.33935546875, "rewards/ppl_reward/std": 18.62110710144043, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.3362963795661926, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 107.171875, "completions/mean_terminated_length": 107.171875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.02193116052391106, "grad_norm": 3.920504331588745, "kl": 0.030548095703125, "learning_rate": 8.292682926829269e-07, "loss": -0.1494, "num_tokens": 232680.0, "reward": -14.841064453125, "reward_std": 1.2328786849975586, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -30.51025390625, "rewards/ppl_reward/std": 28.965320587158203, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.34277912974357605, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 84.546875, "completions/mean_terminated_length": 84.546875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.023149558330795003, "grad_norm": 3.871021270751953, "kl": 0.025482177734375, "learning_rate": 8.780487804878049e-07, "loss": -0.1377, "num_tokens": 244835.0, "reward": -6.949462890625, "reward_std": 0.40485116839408875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -14.58642578125, "rewards/ppl_reward/std": 9.252861022949219, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.3133915960788727, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 109.03125, "completions/mean_terminated_length": 109.03125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.02436795613767895, "grad_norm": 3.5464255809783936, "kl": 0.0250244140625, "learning_rate": 9.26829268292683e-07, "loss": 0.0533, "num_tokens": 258829.0, "reward": -6.320068359375, "reward_std": 1.0719813108444214, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -13.69482421875, "rewards/ppl_reward/std": 10.570899963378906, "rewards/tag_count_reward/mean": 0.52734375, "rewards/tag_count_reward/std": 0.3121774196624756, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 122.84375, "completions/mean_terminated_length": 122.84375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0255863539445629, "grad_norm": 3.4255900382995605, "kl": 0.02874755859375, "learning_rate": 9.75609756097561e-07, "loss": -0.0096, "num_tokens": 273971.0, "reward": -4.265625, "reward_std": 0.5106558203697205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.578125, "rewards/ppl_reward/std": 4.566649913787842, "rewards/tag_count_reward/mean": 0.5234375, "rewards/tag_count_reward/std": 0.32647332549095154, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 91.765625, "completions/mean_terminated_length": 91.765625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.026804751751446847, "grad_norm": 4.456026554107666, "kl": 0.03363037109375, "learning_rate": 1.024390243902439e-06, "loss": -0.1199, "num_tokens": 286788.0, "reward": -5.0250244140625, "reward_std": 0.21363583207130432, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.292236328125, "rewards/ppl_reward/std": 8.12093448638916, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.2314215451478958, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 106.09375, "completions/mean_terminated_length": 106.09375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.028023149558330795, "grad_norm": 4.438456058502197, "kl": 0.0367431640625, "learning_rate": 1.0731707317073172e-06, "loss": 0.0618, "num_tokens": 300218.0, "reward": -4.7611083984375, "reward_std": 0.5988525152206421, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.850341796875, "rewards/ppl_reward/std": 3.331571578979492, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.2302463799715042, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 105.1875, "completions/mean_terminated_length": 105.1875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.029241547365214744, "grad_norm": 4.405534267425537, "kl": 0.039520263671875, "learning_rate": 1.1219512195121953e-06, "loss": -0.0344, "num_tokens": 314070.0, "reward": -4.82421875, "reward_std": 0.8206428289413452, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -10.96875, "rewards/ppl_reward/std": 8.805326461791992, "rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.2519455552101135, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 82.671875, "completions/mean_terminated_length": 82.671875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.03045994517209869, "grad_norm": 4.528026103973389, "kl": 0.07525634765625, "learning_rate": 1.1707317073170732e-06, "loss": -0.0296, "num_tokens": 326009.0, "reward": -13.2618408203125, "reward_std": 2.3880362510681152, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -27.992431640625, "rewards/ppl_reward/std": 31.833303451538086, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 90.4375, "completions/mean_terminated_length": 90.4375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.031678342978982636, "grad_norm": 6.640916347503662, "kl": 0.04522705078125, "learning_rate": 1.2195121951219514e-06, "loss": -0.0136, "num_tokens": 338245.0, "reward": -6.8173828125, "reward_std": 2.011258363723755, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -15.439453125, "rewards/ppl_reward/std": 12.542396545410156, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.1510031819343567, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 121.609375, "completions/mean_terminated_length": 121.609375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.03289674078586659, "grad_norm": 5.470410346984863, "kl": 0.060791015625, "learning_rate": 1.2682926829268293e-06, "loss": 0.0156, "num_tokens": 353228.0, "reward": -5.271728515625, "reward_std": 2.13297700881958, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -12.66845703125, "rewards/ppl_reward/std": 10.11008358001709, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.03411513859275053, "grad_norm": 5.224930286407471, "kl": 0.0928955078125, "learning_rate": 1.3170731707317074e-06, "loss": -0.084, "num_tokens": 366140.0, "reward": -1.44384765625, "reward_std": 1.2933458089828491, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.49776285886764526, "rewards/ppl_reward/mean": -5.3955078125, "rewards/ppl_reward/std": 3.053257465362549, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.14124368131160736, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 120.734375, "completions/mean_terminated_length": 120.734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.035333536399634484, "grad_norm": 3.0019097328186035, "kl": 0.06805419921875, "learning_rate": 1.3658536585365856e-06, "loss": 0.0087, "num_tokens": 381251.0, "reward": -4.3121337890625, "reward_std": 1.353281021118164, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.5037065148353577, "rewards/ppl_reward/mean": -11.335205078125, "rewards/ppl_reward/std": 6.842125415802002, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.1408040076494217, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 105.9375, "completions/mean_terminated_length": 105.9375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.03655193420651843, "grad_norm": 4.17593240737915, "kl": 0.0908203125, "learning_rate": 1.4146341463414633e-06, "loss": 0.1336, "num_tokens": 394671.0, "reward": -1.2620849609375, "reward_std": 0.5723294019699097, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -5.578857421875, "rewards/ppl_reward/std": 2.4808924198150635, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.15425311028957367, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.03777033201340237, "grad_norm": 4.140424728393555, "kl": 0.1260986328125, "learning_rate": 1.4634146341463414e-06, "loss": 0.038, "num_tokens": 408439.0, "reward": -2.8597412109375, "reward_std": 0.7318984270095825, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -8.922607421875, "rewards/ppl_reward/std": 5.9151201248168945, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.1406387835741043, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 109.96875, "completions/mean_terminated_length": 109.96875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.038988729820286325, "grad_norm": 4.069164752960205, "kl": 0.11865234375, "learning_rate": 1.5121951219512196e-06, "loss": -0.0525, "num_tokens": 422829.0, "reward": -2.979248046875, "reward_std": 0.7594366669654846, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.58349609375, "rewards/ppl_reward/std": 10.11573314666748, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 102.078125, "completions/mean_terminated_length": 102.078125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.04020712762717027, "grad_norm": 3.925772190093994, "kl": 0.1297607421875, "learning_rate": 1.5609756097560977e-06, "loss": 0.0868, "num_tokens": 435442.0, "reward": -2.3475341796875, "reward_std": 1.2349737882614136, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.335693359375, "rewards/ppl_reward/std": 5.7453131675720215, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 109.1875, "completions/mean_terminated_length": 109.1875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04142552543405422, "grad_norm": 3.3029255867004395, "kl": 0.1024169921875, "learning_rate": 1.6097560975609759e-06, "loss": 0.141, "num_tokens": 449622.0, "reward": -2.5059814453125, "reward_std": 0.9135293960571289, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.793212890625, "rewards/ppl_reward/std": 4.3098320960998535, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 95.65625, "completions/mean_terminated_length": 95.65625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.042643923240938165, "grad_norm": 3.393984079360962, "kl": 0.129638671875, "learning_rate": 1.6585365853658538e-06, "loss": -0.0202, "num_tokens": 462496.0, "reward": -1.8243408203125, "reward_std": 0.44873401522636414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.609619140625, "rewards/ppl_reward/std": 3.9374799728393555, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.04386232104782212, "grad_norm": 3.344320058822632, "kl": 0.1104736328125, "learning_rate": 1.707317073170732e-06, "loss": 0.1211, "num_tokens": 477496.0, "reward": -1.2127685546875, "reward_std": 0.5656348466873169, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.238037109375, "rewards/ppl_reward/std": 2.3442935943603516, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23517554998397827, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 111.65625, "completions/mean_terminated_length": 111.65625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.04508071885470606, "grad_norm": 3.472776174545288, "kl": 0.1033935546875, "learning_rate": 1.7560975609756098e-06, "loss": 0.0798, "num_tokens": 492282.0, "reward": -2.15625, "reward_std": 0.8477602601051331, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.234375, "rewards/ppl_reward/std": 4.937544822692871, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.046299116661590006, "grad_norm": 3.187865972518921, "kl": 0.13623046875, "learning_rate": 1.804878048780488e-06, "loss": 0.0552, "num_tokens": 506050.0, "reward": -2.879150390625, "reward_std": 0.7319018244743347, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.67236328125, "rewards/ppl_reward/std": 8.581618309020996, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 92.796875, "completions/mean_terminated_length": 92.796875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.04751751446847396, "grad_norm": 3.3878250122070312, "kl": 0.111328125, "learning_rate": 1.853658536585366e-06, "loss": 0.0594, "num_tokens": 518485.0, "reward": -2.111328125, "reward_std": 1.3406872749328613, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.17578125, "rewards/ppl_reward/std": 8.109599113464355, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 94.65625, "completions/mean_terminated_length": 94.65625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0487359122753579, "grad_norm": 3.177593469619751, "kl": 0.10986328125, "learning_rate": 1.902439024390244e-06, "loss": -0.0451, "num_tokens": 531079.0, "reward": -1.1805419921875, "reward_std": 0.3179858326911926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.361083984375, "rewards/ppl_reward/std": 4.740221977233887, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 87.09375, "completions/mean_terminated_length": 87.09375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.049954310082241854, "grad_norm": 3.244277238845825, "kl": 0.117431640625, "learning_rate": 1.951219512195122e-06, "loss": -0.0288, "num_tokens": 543253.0, "reward": -4.103271484375, "reward_std": 1.0130198001861572, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -12.13623046875, "rewards/ppl_reward/std": 8.411355972290039, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0511727078891258, "grad_norm": 3.2600505352020264, "kl": 0.1175537109375, "learning_rate": 2.0000000000000003e-06, "loss": 0.168, "num_tokens": 557509.0, "reward": -1.009521484375, "reward_std": 0.3457116484642029, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.97998046875, "rewards/ppl_reward/std": 4.4045305252075195, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 85.78125, "completions/mean_terminated_length": 85.78125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.05239110569600975, "grad_norm": 3.1778993606567383, "kl": 0.13671875, "learning_rate": 2.048780487804878e-06, "loss": 0.0371, "num_tokens": 570007.0, "reward": -1.4144287109375, "reward_std": 0.19495847821235657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.828857421875, "rewards/ppl_reward/std": 5.093136787414551, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 88.9375, "completions/mean_terminated_length": 88.9375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.053609503502893695, "grad_norm": 3.307697296142578, "kl": 0.114013671875, "learning_rate": 2.097560975609756e-06, "loss": 0.0659, "num_tokens": 582107.0, "reward": -1.7003173828125, "reward_std": 1.8042172193527222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.400634765625, "rewards/ppl_reward/std": 8.66934871673584, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 102.390625, "completions/mean_terminated_length": 102.390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.05482790130977764, "grad_norm": 2.819756031036377, "kl": 0.1092529296875, "learning_rate": 2.1463414634146343e-06, "loss": -0.0012, "num_tokens": 596300.0, "reward": -2.31396484375, "reward_std": 0.5677937269210815, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.5419921875, "rewards/ppl_reward/std": 6.756743431091309, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 77.0, "completions/mean_terminated_length": 77.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.05604629911666159, "grad_norm": 3.324772834777832, "kl": 0.1600341796875, "learning_rate": 2.1951219512195125e-06, "loss": 0.0329, "num_tokens": 607644.0, "reward": -4.0321044921875, "reward_std": 1.2146589756011963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.064208984375, "rewards/ppl_reward/std": 12.467949867248535, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 96.9375, "completions/mean_terminated_length": 96.9375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.057264696923545536, "grad_norm": 2.6960787773132324, "kl": 0.1259765625, "learning_rate": 2.2439024390243906e-06, "loss": 0.0027, "num_tokens": 621416.0, "reward": -0.907958984375, "reward_std": 0.2834787666797638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.81591796875, "rewards/ppl_reward/std": 2.2348310947418213, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 76.125, "completions/mean_terminated_length": 76.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.05848309473042949, "grad_norm": 3.684286594390869, "kl": 0.1676025390625, "learning_rate": 2.2926829268292687e-06, "loss": 0.0591, "num_tokens": 632632.0, "reward": -5.2244873046875, "reward_std": 1.289305329322815, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -14.370849609375, "rewards/ppl_reward/std": 19.40111541748047, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 85.5625, "completions/mean_terminated_length": 85.5625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.05970149253731343, "grad_norm": 2.972324848175049, "kl": 0.14501953125, "learning_rate": 2.3414634146341465e-06, "loss": 0.0058, "num_tokens": 645148.0, "reward": -3.009765625, "reward_std": 1.0179588794708252, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.97265625, "rewards/ppl_reward/std": 3.489499092102051, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 78.296875, "completions/mean_terminated_length": 78.296875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.06091989034419738, "grad_norm": 2.9574732780456543, "kl": 0.160400390625, "learning_rate": 2.3902439024390246e-06, "loss": -0.0239, "num_tokens": 656927.0, "reward": -7.6153564453125, "reward_std": 0.7507431507110596, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -19.230712890625, "rewards/ppl_reward/std": 34.408077239990234, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 87.234375, "completions/mean_terminated_length": 87.234375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.06213828815108133, "grad_norm": 3.308506488800049, "kl": 0.17138671875, "learning_rate": 2.4390243902439027e-06, "loss": 0.027, "num_tokens": 670270.0, "reward": -1.6123046875, "reward_std": 0.3886762857437134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.224609375, "rewards/ppl_reward/std": 2.3409996032714844, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 78.96875, "completions/mean_terminated_length": 78.96875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.06335668595796527, "grad_norm": 3.607879638671875, "kl": 0.220703125, "learning_rate": 2.487804878048781e-06, "loss": -0.0531, "num_tokens": 682780.0, "reward": -2.0068359375, "reward_std": 0.4412074685096741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.013671875, "rewards/ppl_reward/std": 4.88762092590332, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 82.953125, "completions/mean_terminated_length": 82.953125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.06457508376484922, "grad_norm": 3.228184223175049, "kl": 0.19970703125, "learning_rate": 2.5365853658536586e-06, "loss": -0.0503, "num_tokens": 695313.0, "reward": -0.69873046875, "reward_std": 0.19971606135368347, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.3583984375, "rewards/ppl_reward/std": 1.9979099035263062, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 76.546875, "completions/mean_terminated_length": 76.546875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.06579348157173318, "grad_norm": 3.443490982055664, "kl": 0.20703125, "learning_rate": 2.5853658536585367e-06, "loss": -0.049, "num_tokens": 707220.0, "reward": -3.6649169921875, "reward_std": 0.31633180379867554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.329833984375, "rewards/ppl_reward/std": 9.63594913482666, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 80.78125, "completions/mean_terminated_length": 80.78125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.06701187937861712, "grad_norm": 3.3308053016662598, "kl": 0.207763671875, "learning_rate": 2.634146341463415e-06, "loss": -0.0035, "num_tokens": 719662.0, "reward": -3.44403076171875, "reward_std": 0.3061312139034271, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.8880615234375, "rewards/ppl_reward/std": 11.995141983032227, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 76.796875, "completions/mean_terminated_length": 76.796875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.06823027718550106, "grad_norm": 3.271350622177124, "kl": 0.23876953125, "learning_rate": 2.682926829268293e-06, "loss": -0.0334, "num_tokens": 731609.0, "reward": -2.39453125, "reward_std": 0.28845110535621643, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.7890625, "rewards/ppl_reward/std": 5.671557903289795, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 82.359375, "completions/mean_terminated_length": 82.359375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.06944867499238501, "grad_norm": 3.417114496231079, "kl": 0.24658203125, "learning_rate": 2.731707317073171e-06, "loss": 0.0337, "num_tokens": 744224.0, "reward": -3.38818359375, "reward_std": 0.49387025833129883, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.7373046875, "rewards/ppl_reward/std": 5.509284496307373, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 74.484375, "completions/mean_terminated_length": 74.484375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07066707279926897, "grad_norm": 3.1568689346313477, "kl": 0.255859375, "learning_rate": 2.7804878048780493e-06, "loss": -0.0529, "num_tokens": 755655.0, "reward": -1.134765625, "reward_std": 0.3005996346473694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.26953125, "rewards/ppl_reward/std": 1.318703532218933, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 81.234375, "completions/mean_terminated_length": 81.234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.07188547060615291, "grad_norm": 3.548064947128296, "kl": 0.27490234375, "learning_rate": 2.8292682926829266e-06, "loss": 0.0149, "num_tokens": 767606.0, "reward": -2.0526123046875, "reward_std": 0.6020480394363403, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.058349609375, "rewards/ppl_reward/std": 9.351602554321289, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 94.71875, "completions/mean_terminated_length": 94.71875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.07310386841303686, "grad_norm": 2.994682788848877, "kl": 0.218017578125, "learning_rate": 2.8780487804878047e-06, "loss": -0.1766, "num_tokens": 781348.0, "reward": -0.75927734375, "reward_std": 0.2808852195739746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.5185546875, "rewards/ppl_reward/std": 2.7967453002929688, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 103.328125, "completions/mean_terminated_length": 103.328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0743222662199208, "grad_norm": 2.961920738220215, "kl": 0.237060546875, "learning_rate": 2.926829268292683e-06, "loss": 0.1651, "num_tokens": 794769.0, "reward": -2.44384765625, "reward_std": 0.8536393642425537, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.8408203125, "rewards/ppl_reward/std": 8.61094856262207, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 100.46875, "completions/mean_terminated_length": 100.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.07554066402680475, "grad_norm": 2.912839651107788, "kl": 0.23681640625, "learning_rate": 2.975609756097561e-06, "loss": 0.0356, "num_tokens": 808215.0, "reward": -0.7896728515625, "reward_std": 0.21826709806919098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.579345703125, "rewards/ppl_reward/std": 1.5208570957183838, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 115.03125, "completions/mean_terminated_length": 115.03125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0767590618336887, "grad_norm": 2.644758701324463, "kl": 0.202392578125, "learning_rate": 3.024390243902439e-06, "loss": -0.0235, "num_tokens": 823273.0, "reward": -3.447265625, "reward_std": 0.7828366756439209, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.85546875, "rewards/ppl_reward/std": 14.532094955444336, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 110.828125, "completions/mean_terminated_length": 110.828125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.07797745964057265, "grad_norm": 2.572784423828125, "kl": 0.220458984375, "learning_rate": 3.0731707317073173e-06, "loss": -0.0551, "num_tokens": 837718.0, "reward": -0.7130126953125, "reward_std": 0.21769209206104279, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.426025390625, "rewards/ppl_reward/std": 1.7765341997146606, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 118.90625, "completions/mean_terminated_length": 118.90625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0791958574474566, "grad_norm": 2.6919708251953125, "kl": 0.18798828125, "learning_rate": 3.1219512195121954e-06, "loss": -0.0544, "num_tokens": 852632.0, "reward": -2.283935546875, "reward_std": 0.5029987692832947, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.56787109375, "rewards/ppl_reward/std": 7.67180061340332, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 112.171875, "completions/mean_terminated_length": 112.171875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.08041425525434054, "grad_norm": 2.736766815185547, "kl": 0.22265625, "learning_rate": 3.1707317073170736e-06, "loss": 0.0367, "num_tokens": 866995.0, "reward": -2.140380859375, "reward_std": 0.6666809916496277, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.14794921875, "rewards/ppl_reward/std": 2.8183414936065674, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 110.328125, "completions/mean_terminated_length": 110.328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.08163265306122448, "grad_norm": 2.6457228660583496, "kl": 0.213623046875, "learning_rate": 3.2195121951219517e-06, "loss": -0.0963, "num_tokens": 880840.0, "reward": -2.6427001953125, "reward_std": 0.5287624597549438, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -9.160400390625, "rewards/ppl_reward/std": 9.910276412963867, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 141.34375, "completions/mean_terminated_length": 141.34375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.08285105086810844, "grad_norm": 2.3984920978546143, "kl": 0.19140625, "learning_rate": 3.26829268292683e-06, "loss": -0.1022, "num_tokens": 897198.0, "reward": -1.8018798828125, "reward_std": 0.37642520666122437, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.517822265625, "rewards/ppl_reward/std": 3.147583246231079, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.08406944867499239, "grad_norm": 2.967463731765747, "kl": 0.193603515625, "learning_rate": 3.3170731707317076e-06, "loss": 0.1059, "num_tokens": 911607.0, "reward": -0.4376220703125, "reward_std": 0.4323465824127197, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.750244140625, "rewards/ppl_reward/std": 1.7478851079940796, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 125.703125, "completions/mean_terminated_length": 125.703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.08528784648187633, "grad_norm": 2.568765163421631, "kl": 0.18896484375, "learning_rate": 3.3658536585365857e-06, "loss": -0.0383, "num_tokens": 926556.0, "reward": -4.7393798828125, "reward_std": 1.1087034940719604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -13.478759765625, "rewards/ppl_reward/std": 20.929649353027344, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 125.546875, "completions/mean_terminated_length": 125.546875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.08650624428876028, "grad_norm": 2.994828224182129, "kl": 0.189697265625, "learning_rate": 3.414634146341464e-06, "loss": 0.1431, "num_tokens": 940903.0, "reward": -1.229736328125, "reward_std": 0.31381306052207947, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.44384765625, "rewards/ppl_reward/std": 4.649298667907715, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 150.984375, "completions/mean_terminated_length": 150.984375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.08772464209564423, "grad_norm": 2.123347282409668, "kl": 0.1474609375, "learning_rate": 3.4634146341463416e-06, "loss": -0.0233, "num_tokens": 958630.0, "reward": -1.7239990234375, "reward_std": 0.4750729501247406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.447998046875, "rewards/ppl_reward/std": 7.225244998931885, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 123.046875, "completions/mean_terminated_length": 123.046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.08894303990252818, "grad_norm": 2.5198819637298584, "kl": 0.187744140625, "learning_rate": 3.5121951219512197e-06, "loss": -0.0639, "num_tokens": 973193.0, "reward": -2.973388671875, "reward_std": 0.5177909731864929, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.93115234375, "rewards/ppl_reward/std": 5.53569221496582, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 131.828125, "completions/mean_terminated_length": 131.828125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.09016143770941212, "grad_norm": 2.465074300765991, "kl": 0.167236328125, "learning_rate": 3.560975609756098e-06, "loss": -0.0295, "num_tokens": 988294.0, "reward": -1.484375, "reward_std": 0.4390271008014679, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.96875, "rewards/ppl_reward/std": 3.227205276489258, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 124.921875, "completions/mean_terminated_length": 124.921875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.09137983551629607, "grad_norm": 2.446629762649536, "kl": 0.175537109375, "learning_rate": 3.609756097560976e-06, "loss": 0.0442, "num_tokens": 1003145.0, "reward": -1.9500732421875, "reward_std": 0.385883629322052, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.814208984375, "rewards/ppl_reward/std": 5.578749656677246, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 136.4375, "completions/mean_terminated_length": 136.4375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.09259823332318001, "grad_norm": 2.2633533477783203, "kl": 0.1591796875, "learning_rate": 3.6585365853658537e-06, "loss": 0.002, "num_tokens": 1018637.0, "reward": -1.8995361328125, "reward_std": 0.407083123922348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.799072265625, "rewards/ppl_reward/std": 3.4443395137786865, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.09381663113006397, "grad_norm": 2.245643138885498, "kl": 0.1416015625, "learning_rate": 3.707317073170732e-06, "loss": -0.0383, "num_tokens": 1034633.0, "reward": -1.3768310546875, "reward_std": 0.2919788360595703, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.753662109375, "rewards/ppl_reward/std": 3.0252017974853516, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 138.78125, "completions/mean_terminated_length": 138.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.09503502893694792, "grad_norm": 2.3781557083129883, "kl": 0.1484375, "learning_rate": 3.75609756097561e-06, "loss": -0.0269, "num_tokens": 1050979.0, "reward": -1.92236328125, "reward_std": 0.38744449615478516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.8447265625, "rewards/ppl_reward/std": 5.701778888702393, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 153.453125, "completions/mean_terminated_length": 153.453125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.09625342674383186, "grad_norm": 2.20378041267395, "kl": 0.14599609375, "learning_rate": 3.804878048780488e-06, "loss": -0.0033, "num_tokens": 1068856.0, "reward": -0.833740234375, "reward_std": 0.1177571564912796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.66748046875, "rewards/ppl_reward/std": 2.988515615463257, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.0974718245507158, "grad_norm": 2.2765793800354004, "kl": 0.1689453125, "learning_rate": 3.853658536585366e-06, "loss": -0.037, "num_tokens": 1085052.0, "reward": -0.7867431640625, "reward_std": 0.22862447798252106, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.573486328125, "rewards/ppl_reward/std": 2.58359694480896, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 137.046875, "completions/mean_terminated_length": 137.046875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.09869022235759975, "grad_norm": 2.3604187965393066, "kl": 0.1669921875, "learning_rate": 3.902439024390244e-06, "loss": 0.0672, "num_tokens": 1101271.0, "reward": -1.3935546875, "reward_std": 0.22249147295951843, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.787109375, "rewards/ppl_reward/std": 4.56063175201416, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 135.484375, "completions/mean_terminated_length": 135.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.09990862016448371, "grad_norm": 2.484464645385742, "kl": 0.174560546875, "learning_rate": 3.9512195121951225e-06, "loss": 0.0582, "num_tokens": 1116862.0, "reward": -3.604248046875, "reward_std": 1.6792032718658447, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.20849609375, "rewards/ppl_reward/std": 17.431610107421875, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.10112701797136765, "grad_norm": 2.413614273071289, "kl": 0.174560546875, "learning_rate": 4.000000000000001e-06, "loss": 0.002, "num_tokens": 1131708.0, "reward": -0.8826904296875, "reward_std": 0.2670535743236542, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.726318359375, "rewards/ppl_reward/std": 2.8590002059936523, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 136.078125, "completions/mean_terminated_length": 136.078125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1023454157782516, "grad_norm": 2.361656665802002, "kl": 0.150390625, "learning_rate": 4.048780487804879e-06, "loss": -0.0206, "num_tokens": 1147553.0, "reward": -1.357421875, "reward_std": 0.19573107361793518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.71484375, "rewards/ppl_reward/std": 2.931166410446167, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 130.890625, "completions/mean_terminated_length": 130.890625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.10356381358513554, "grad_norm": 2.4765477180480957, "kl": 0.1796875, "learning_rate": 4.097560975609756e-06, "loss": 0.009, "num_tokens": 1163202.0, "reward": -1.127197265625, "reward_std": 0.2529106140136719, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.21533203125, "rewards/ppl_reward/std": 4.605705738067627, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 146.09375, "completions/mean_terminated_length": 146.09375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.1047822113920195, "grad_norm": 2.175280809402466, "kl": 0.1446533203125, "learning_rate": 4.146341463414634e-06, "loss": 0.0486, "num_tokens": 1180224.0, "reward": -0.736083984375, "reward_std": 0.28174659609794617, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.43310546875, "rewards/ppl_reward/std": 2.367628812789917, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 104.078125, "completions/mean_terminated_length": 104.078125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.10600060919890344, "grad_norm": 2.423334836959839, "kl": 0.17919921875, "learning_rate": 4.195121951219512e-06, "loss": 0.007, "num_tokens": 1193125.0, "reward": -0.623291015625, "reward_std": 0.19587835669517517, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.24658203125, "rewards/ppl_reward/std": 2.900439500808716, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 121.859375, "completions/mean_terminated_length": 121.859375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.10721900700578739, "grad_norm": 2.3266818523406982, "kl": 0.1510009765625, "learning_rate": 4.2439024390243905e-06, "loss": 0.0618, "num_tokens": 1207484.0, "reward": -2.9029541015625, "reward_std": 0.4861726760864258, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.774658203125, "rewards/ppl_reward/std": 5.26531982421875, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 120.453125, "completions/mean_terminated_length": 120.453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.10843740481267133, "grad_norm": 2.3581535816192627, "kl": 0.1494140625, "learning_rate": 4.292682926829269e-06, "loss": -0.0582, "num_tokens": 1222433.0, "reward": -1.065185546875, "reward_std": 0.2714100480079651, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.13037109375, "rewards/ppl_reward/std": 2.9691576957702637, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 115.609375, "completions/mean_terminated_length": 115.609375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.10965580261955528, "grad_norm": 2.47957444190979, "kl": 0.16259765625, "learning_rate": 4.341463414634147e-06, "loss": 0.0149, "num_tokens": 1236656.0, "reward": -1.2393798828125, "reward_std": 0.32371005415916443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.478759765625, "rewards/ppl_reward/std": 4.1541595458984375, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 125.03125, "completions/mean_terminated_length": 125.03125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.11087420042643924, "grad_norm": 2.3542559146881104, "kl": 0.1640625, "learning_rate": 4.390243902439025e-06, "loss": 0.071, "num_tokens": 1251466.0, "reward": -0.6334228515625, "reward_std": 0.20680001378059387, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.266845703125, "rewards/ppl_reward/std": 1.8854297399520874, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 110.0625, "completions/mean_terminated_length": 110.0625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.11209259823332318, "grad_norm": 2.513343095779419, "kl": 0.187744140625, "learning_rate": 4.439024390243903e-06, "loss": 0.0007, "num_tokens": 1265558.0, "reward": -2.161865234375, "reward_std": 0.45063847303390503, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.28466796875, "rewards/ppl_reward/std": 6.98577356338501, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 96.625, "completions/mean_terminated_length": 96.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.11331099604020713, "grad_norm": 2.9058289527893066, "kl": 0.232177734375, "learning_rate": 4.487804878048781e-06, "loss": -0.0243, "num_tokens": 1278262.0, "reward": -2.7139892578125, "reward_std": 0.5498888492584229, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.427978515625, "rewards/ppl_reward/std": 7.170175552368164, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 105.625, "completions/mean_terminated_length": 105.625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.11452939384709107, "grad_norm": 2.6539242267608643, "kl": 0.22998046875, "learning_rate": 4.536585365853659e-06, "loss": -0.0023, "num_tokens": 1292206.0, "reward": -1.652587890625, "reward_std": 0.23529165983200073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.30517578125, "rewards/ppl_reward/std": 3.153637647628784, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 100.328125, "completions/mean_terminated_length": 100.328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.11574779165397503, "grad_norm": 2.5904741287231445, "kl": 0.245361328125, "learning_rate": 4.5853658536585375e-06, "loss": -0.057, "num_tokens": 1305843.0, "reward": -2.5040283203125, "reward_std": 0.46247848868370056, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.008056640625, "rewards/ppl_reward/std": 7.036736488342285, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 90.109375, "completions/mean_terminated_length": 90.109375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.11696618946085897, "grad_norm": 3.06960129737854, "kl": 0.277099609375, "learning_rate": 4.634146341463416e-06, "loss": -0.0032, "num_tokens": 1317866.0, "reward": -1.6361083984375, "reward_std": 0.2670316696166992, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.233154296875, "rewards/ppl_reward/std": 3.6345012187957764, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.11818458726774292, "grad_norm": 2.495875835418701, "kl": 0.258056640625, "learning_rate": 4.682926829268293e-06, "loss": -0.02, "num_tokens": 1331634.0, "reward": -1.1365966796875, "reward_std": 0.230637326836586, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.273193359375, "rewards/ppl_reward/std": 2.8544273376464844, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.11940298507462686, "grad_norm": 2.733351230621338, "kl": 0.2763671875, "learning_rate": 4.731707317073171e-06, "loss": -0.0217, "num_tokens": 1345344.0, "reward": -0.45849609375, "reward_std": 0.231999009847641, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.8779296875, "rewards/ppl_reward/std": 2.180765151977539, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 96.09375, "completions/mean_terminated_length": 96.09375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.12062138288151081, "grad_norm": 3.145207643508911, "kl": 0.29931640625, "learning_rate": 4.780487804878049e-06, "loss": 0.0095, "num_tokens": 1358062.0, "reward": -2.5882568359375, "reward_std": 0.5628554821014404, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.129638671875, "rewards/ppl_reward/std": 6.747293472290039, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 112.28125, "completions/mean_terminated_length": 112.28125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.12183978068839477, "grad_norm": 2.603081226348877, "kl": 0.271728515625, "learning_rate": 4.829268292682927e-06, "loss": 0.0261, "num_tokens": 1373264.0, "reward": -1.009033203125, "reward_std": 0.5926703810691833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.01806640625, "rewards/ppl_reward/std": 3.9307448863983154, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 112.90625, "completions/mean_terminated_length": 112.90625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.12305817849527871, "grad_norm": 2.3285279273986816, "kl": 0.30078125, "learning_rate": 4.8780487804878055e-06, "loss": -0.0636, "num_tokens": 1387538.0, "reward": -3.985107421875, "reward_std": 0.6594923734664917, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.97021484375, "rewards/ppl_reward/std": 12.641143798828125, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.12427657630216266, "grad_norm": 2.5687856674194336, "kl": 0.318359375, "learning_rate": 4.926829268292684e-06, "loss": -0.052, "num_tokens": 1401306.0, "reward": -2.72308349609375, "reward_std": 0.6631431579589844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.4461669921875, "rewards/ppl_reward/std": 8.491022109985352, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 120.703125, "completions/mean_terminated_length": 120.703125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.12549497410904661, "grad_norm": 2.9909133911132812, "kl": 0.28466796875, "learning_rate": 4.975609756097562e-06, "loss": -0.0038, "num_tokens": 1415543.0, "reward": -1.0618896484375, "reward_std": 0.3246648609638214, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.076904296875, "rewards/ppl_reward/std": 3.4602315425872803, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 144.359375, "completions/mean_terminated_length": 144.359375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.12671337191593054, "grad_norm": 2.359078884124756, "kl": 0.268798828125, "learning_rate": 5.024390243902439e-06, "loss": -0.0426, "num_tokens": 1432022.0, "reward": -2.148193359375, "reward_std": 0.45882701873779297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.29638671875, "rewards/ppl_reward/std": 7.760997772216797, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 177.15625, "completions/mean_terminated_length": 177.15625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1279317697228145, "grad_norm": 14.759847640991211, "kl": 0.552001953125, "learning_rate": 5.073170731707317e-06, "loss": -0.0229, "num_tokens": 1451424.0, "reward": -0.375244140625, "reward_std": 0.20465204119682312, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.71142578125, "rewards/ppl_reward/std": 1.5494518280029297, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 165.515625, "completions/mean_terminated_length": 165.515625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12915016752969843, "grad_norm": 2.076524257659912, "kl": 0.24072265625, "learning_rate": 5.121951219512195e-06, "loss": -0.0187, "num_tokens": 1468753.0, "reward": -2.003173828125, "reward_std": 0.6437720060348511, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.89697265625, "rewards/ppl_reward/std": 4.953807830810547, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 188.21875, "completions/mean_terminated_length": 188.21875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.1303685653365824, "grad_norm": 2.1544289588928223, "kl": 0.25146484375, "learning_rate": 5.1707317073170735e-06, "loss": 0.0773, "num_tokens": 1487647.0, "reward": -0.34710693359375, "reward_std": 0.5436285138130188, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.5457763671875, "rewards/ppl_reward/std": 2.7334582805633545, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 231.15625, "completions/mean_terminated_length": 231.15625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13158696314346635, "grad_norm": 1.8865766525268555, "kl": 0.2236328125, "learning_rate": 5.219512195121952e-06, "loss": 0.0323, "num_tokens": 1510449.0, "reward": -1.99658203125, "reward_std": 0.391140878200531, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.8525390625, "rewards/ppl_reward/std": 6.184365272521973, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 227.078125, "completions/mean_terminated_length": 227.078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.13280536095035028, "grad_norm": 1.869513988494873, "kl": 0.21484375, "learning_rate": 5.26829268292683e-06, "loss": -0.0188, "num_tokens": 1531462.0, "reward": -4.083984375, "reward_std": 0.6808915138244629, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -12.12890625, "rewards/ppl_reward/std": 9.07442569732666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 264.1875, "completions/mean_terminated_length": 264.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.13402375875723424, "grad_norm": 2.0477614402770996, "kl": 0.21728515625, "learning_rate": 5.317073170731708e-06, "loss": 0.0916, "num_tokens": 1554906.0, "reward": -3.0556640625, "reward_std": 0.6285160779953003, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.845703125, "rewards/ppl_reward/std": 8.324163436889648, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 264.21875, "completions/mean_terminated_length": 264.21875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.13524215656411817, "grad_norm": 1.586763858795166, "kl": 0.197509765625, "learning_rate": 5.365853658536586e-06, "loss": 0.0, "num_tokens": 1579664.0, "reward": -1.0709228515625, "reward_std": 0.4563199281692505, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.719970703125, "rewards/ppl_reward/std": 2.756974220275879, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13706642389297485, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 254.890625, "completions/mean_terminated_length": 254.890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.13646055437100213, "grad_norm": 1.7437471151351929, "kl": 0.216796875, "learning_rate": 5.414634146341464e-06, "loss": 0.0713, "num_tokens": 1602505.0, "reward": -1.147705078125, "reward_std": 0.7188407182693481, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.68603515625, "rewards/ppl_reward/std": 2.438418388366699, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.12962667644023895, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 263.59375, "completions/mean_terminated_length": 263.59375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1376789521778861, "grad_norm": 1.7775447368621826, "kl": 0.192138671875, "learning_rate": 5.463414634146342e-06, "loss": 0.0232, "num_tokens": 1625991.0, "reward": -2.81109619140625, "reward_std": 1.1232812404632568, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.3175048828125, "rewards/ppl_reward/std": 5.919703960418701, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 259.78125, "completions/mean_terminated_length": 259.78125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.13889734998477002, "grad_norm": 1.5596420764923096, "kl": 0.196044921875, "learning_rate": 5.5121951219512205e-06, "loss": 0.0423, "num_tokens": 1650545.0, "reward": -1.5301513671875, "reward_std": 0.7102817296981812, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.638427734375, "rewards/ppl_reward/std": 3.031449317932129, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 213.046875, "completions/mean_terminated_length": 213.046875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.14011574779165398, "grad_norm": 1.872291088104248, "kl": 0.208740234375, "learning_rate": 5.560975609756099e-06, "loss": 0.0275, "num_tokens": 1670948.0, "reward": -2.53564453125, "reward_std": 0.8143570423126221, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.6337890625, "rewards/ppl_reward/std": 8.923409461975098, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0833333358168602, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 232.90625, "completions/mean_terminated_length": 232.90625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.14133414559853794, "grad_norm": 1.9268497228622437, "kl": 0.21044921875, "learning_rate": 5.609756097560977e-06, "loss": 0.003, "num_tokens": 1692182.0, "reward": -2.6514892578125, "reward_std": 0.963127851486206, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.5037065148353577, "rewards/ppl_reward/mean": -7.849853515625, "rewards/ppl_reward/std": 6.896634101867676, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.14255254340542187, "grad_norm": 1.9238383769989014, "kl": 0.202880859375, "learning_rate": 5.658536585365853e-06, "loss": -0.042, "num_tokens": 1712742.0, "reward": -0.71337890625, "reward_std": 0.5520173907279968, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -4.8408203125, "rewards/ppl_reward/std": 2.335536003112793, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.15861308574676514, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 223.6875, "completions/mean_terminated_length": 223.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.14377094121230583, "grad_norm": 1.7316782474517822, "kl": 0.195556640625, "learning_rate": 5.707317073170731e-06, "loss": -0.0194, "num_tokens": 1733994.0, "reward": -3.0706787109375, "reward_std": 0.8990435004234314, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -10.071044921875, "rewards/ppl_reward/std": 6.6969499588012695, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 194.609375, "completions/mean_terminated_length": 194.609375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.14498933901918976, "grad_norm": 1.8995025157928467, "kl": 0.208740234375, "learning_rate": 5.7560975609756095e-06, "loss": 0.0292, "num_tokens": 1753585.0, "reward": -3.02642822265625, "reward_std": 0.5320390462875366, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.0059814453125, "rewards/ppl_reward/std": 8.904060363769531, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.14620773682607371, "grad_norm": 2.0618348121643066, "kl": 0.230712890625, "learning_rate": 5.804878048780488e-06, "loss": 0.0298, "num_tokens": 1773401.0, "reward": -1.341796875, "reward_std": 0.4077373147010803, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.54296875, "rewards/ppl_reward/std": 2.4443204402923584, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 161.46875, "completions/mean_terminated_length": 161.46875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.14742613463295767, "grad_norm": 2.1044681072235107, "kl": 0.22900390625, "learning_rate": 5.853658536585366e-06, "loss": -0.0227, "num_tokens": 1790791.0, "reward": -1.7254638671875, "reward_std": 0.4350489377975464, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.372802734375, "rewards/ppl_reward/std": 5.163678169250488, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 152.640625, "completions/mean_terminated_length": 152.640625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1486445324398416, "grad_norm": 2.0767297744750977, "kl": 0.215576171875, "learning_rate": 5.902439024390244e-06, "loss": 0.0118, "num_tokens": 1807280.0, "reward": -1.4130859375, "reward_std": 0.3195490837097168, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.787109375, "rewards/ppl_reward/std": 1.954596757888794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.14986293024672556, "grad_norm": 2.2168071269989014, "kl": 0.240478515625, "learning_rate": 5.951219512195122e-06, "loss": 0.0384, "num_tokens": 1822080.0, "reward": -10.2421875, "reward_std": 1.683593988418579, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -24.453125, "rewards/ppl_reward/std": 27.821474075317383, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 141.34375, "completions/mean_terminated_length": 141.34375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.1510813280536095, "grad_norm": 1.9768743515014648, "kl": 0.197509765625, "learning_rate": 6e-06, "loss": -0.0233, "num_tokens": 1838790.0, "reward": -0.4232177734375, "reward_std": 0.16309769451618195, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.807373046875, "rewards/ppl_reward/std": 2.101675271987915, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 115.21875, "completions/mean_terminated_length": 115.21875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.15229972586049345, "grad_norm": 2.303302049636841, "kl": 0.227294921875, "learning_rate": 6.048780487804878e-06, "loss": -0.0244, "num_tokens": 1852764.0, "reward": -1.6539306640625, "reward_std": 0.44396328926086426, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.260986328125, "rewards/ppl_reward/std": 4.137269496917725, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 113.515625, "completions/mean_terminated_length": 113.515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1535181236673774, "grad_norm": 2.061652421951294, "kl": 0.26025390625, "learning_rate": 6.0975609756097564e-06, "loss": -0.0167, "num_tokens": 1867045.0, "reward": -3.3330078125, "reward_std": 0.888390064239502, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.626953125, "rewards/ppl_reward/std": 10.879913330078125, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 111.484375, "completions/mean_terminated_length": 111.484375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.15473652147426134, "grad_norm": 2.39591383934021, "kl": 0.24609375, "learning_rate": 6.1463414634146346e-06, "loss": -0.0492, "num_tokens": 1881380.0, "reward": -0.703857421875, "reward_std": 0.19792896509170532, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.40771484375, "rewards/ppl_reward/std": 1.539925456047058, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 104.484375, "completions/mean_terminated_length": 104.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.1559549192811453, "grad_norm": 2.4163124561309814, "kl": 0.26220703125, "learning_rate": 6.195121951219513e-06, "loss": -0.0687, "num_tokens": 1894723.0, "reward": -2.841064453125, "reward_std": 0.5910916328430176, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.60400390625, "rewards/ppl_reward/std": 6.950483322143555, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.15717331708802923, "grad_norm": 2.613079786300659, "kl": 0.3388671875, "learning_rate": 6.243902439024391e-06, "loss": 0.0021, "num_tokens": 1908123.0, "reward": -0.9913330078125, "reward_std": 0.3640269339084625, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.904541015625, "rewards/ppl_reward/std": 1.604505181312561, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 101.671875, "completions/mean_terminated_length": 101.671875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1583917148949132, "grad_norm": 2.5388338565826416, "kl": 0.293212890625, "learning_rate": 6.292682926829269e-06, "loss": 0.0109, "num_tokens": 1920990.0, "reward": -1.0955810546875, "reward_std": 0.3058258891105652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.191162109375, "rewards/ppl_reward/std": 2.888573408126831, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 116.21875, "completions/mean_terminated_length": 116.21875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.15961011270179715, "grad_norm": 2.725884437561035, "kl": 0.275390625, "learning_rate": 6.341463414634147e-06, "loss": -0.0758, "num_tokens": 1935668.0, "reward": -1.1768798828125, "reward_std": 0.3090110421180725, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.267822265625, "rewards/ppl_reward/std": 4.489551544189453, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 103.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.16082851050868108, "grad_norm": 2.5242886543273926, "kl": 0.3212890625, "learning_rate": 6.390243902439025e-06, "loss": -0.0319, "num_tokens": 1948980.0, "reward": -1.8701171875, "reward_std": 0.6071804165840149, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.513671875, "rewards/ppl_reward/std": 4.890556335449219, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 120.78125, "completions/mean_terminated_length": 120.78125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.16204690831556504, "grad_norm": 2.3547093868255615, "kl": 0.33203125, "learning_rate": 6.439024390243903e-06, "loss": -0.0473, "num_tokens": 1963822.0, "reward": -1.0965576171875, "reward_std": 0.2644021511077881, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.114990234375, "rewards/ppl_reward/std": 2.590660572052002, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 132.96875, "completions/mean_terminated_length": 132.96875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.16326530612244897, "grad_norm": 2.0764129161834717, "kl": 0.28466796875, "learning_rate": 6.4878048780487815e-06, "loss": 0.0008, "num_tokens": 1979740.0, "reward": -2.0211181640625, "reward_std": 0.51406329870224, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.714111328125, "rewards/ppl_reward/std": 5.516513347625732, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 127.296875, "completions/mean_terminated_length": 127.296875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.16448370392933293, "grad_norm": 2.273808717727661, "kl": 0.33984375, "learning_rate": 6.53658536585366e-06, "loss": 0.0626, "num_tokens": 1994423.0, "reward": -0.9581298828125, "reward_std": 0.5006543397903442, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.541259765625, "rewards/ppl_reward/std": 1.7233376502990723, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.125, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 127.671875, "completions/mean_terminated_length": 127.671875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.16570210173621688, "grad_norm": 2.290184736251831, "kl": 0.33935546875, "learning_rate": 6.585365853658538e-06, "loss": -0.0541, "num_tokens": 2009514.0, "reward": -2.3582763671875, "reward_std": 0.5439121723175049, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.591552734375, "rewards/ppl_reward/std": 9.26832103729248, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.16692049954310081, "grad_norm": 2.2230916023254395, "kl": 0.31591796875, "learning_rate": 6.634146341463415e-06, "loss": 0.0172, "num_tokens": 2025834.0, "reward": -2.4544677734375, "reward_std": 1.411651611328125, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.510498046875, "rewards/ppl_reward/std": 9.05359172821045, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 148.96875, "completions/mean_terminated_length": 148.96875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.16813889734998477, "grad_norm": 2.1264913082122803, "kl": 0.30712890625, "learning_rate": 6.682926829268293e-06, "loss": -0.0102, "num_tokens": 2042152.0, "reward": -0.836181640625, "reward_std": 0.34789520502090454, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.51611328125, "rewards/ppl_reward/std": 1.7022954225540161, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 140.453125, "completions/mean_terminated_length": 140.453125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.16935729515686873, "grad_norm": 1.8707163333892822, "kl": 0.33837890625, "learning_rate": 6.731707317073171e-06, "loss": -0.0266, "num_tokens": 2057197.0, "reward": -9.60760498046875, "reward_std": 8.319808959960938, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -23.1761474609375, "rewards/ppl_reward/std": 59.54058837890625, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.17057569296375266, "grad_norm": 1.949610948562622, "kl": 0.3466796875, "learning_rate": 6.7804878048780495e-06, "loss": -0.0421, "num_tokens": 2074129.0, "reward": -0.618408203125, "reward_std": 0.2893338203430176, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.14306640625, "rewards/ppl_reward/std": 1.9013774394989014, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.17179409077063662, "grad_norm": 2.035886287689209, "kl": 0.3115234375, "learning_rate": 6.829268292682928e-06, "loss": -0.0435, "num_tokens": 2090785.0, "reward": -1.62841796875, "reward_std": 0.4563731551170349, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.1943359375, "rewards/ppl_reward/std": 2.7286269664764404, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 174.96875, "completions/mean_terminated_length": 174.96875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.17301248857752055, "grad_norm": 1.894422173500061, "kl": 0.30078125, "learning_rate": 6.878048780487805e-06, "loss": 0.1109, "num_tokens": 2109359.0, "reward": -7.8280029296875, "reward_std": 1.3961737155914307, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -19.460693359375, "rewards/ppl_reward/std": 27.56568717956543, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1742308863844045, "grad_norm": 1.8174607753753662, "kl": 0.281005859375, "learning_rate": 6.926829268292683e-06, "loss": -0.1095, "num_tokens": 2129171.0, "reward": -0.7041015625, "reward_std": 0.4796181321144104, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.205078125, "rewards/ppl_reward/std": 3.2879106998443604, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 211.015625, "completions/mean_terminated_length": 211.015625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.17544928419128847, "grad_norm": 1.8666101694107056, "kl": 0.2724609375, "learning_rate": 6.975609756097561e-06, "loss": -0.1084, "num_tokens": 2150636.0, "reward": -2.3641357421875, "reward_std": 0.29891878366470337, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.657958984375, "rewards/ppl_reward/std": 4.3074116706848145, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 212.40625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1766676819981724, "grad_norm": 1.7843928337097168, "kl": 0.28515625, "learning_rate": 7.024390243902439e-06, "loss": -0.0127, "num_tokens": 2171054.0, "reward": -2.8902587890625, "reward_std": 1.2389473915100098, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -9.639892578125, "rewards/ppl_reward/std": 7.569603443145752, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.17788607980505636, "grad_norm": 1.7108807563781738, "kl": 0.279296875, "learning_rate": 7.0731707317073175e-06, "loss": -0.1075, "num_tokens": 2192542.0, "reward": -2.2249755859375, "reward_std": 0.7524539828300476, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.293701171875, "rewards/ppl_reward/std": 8.712157249450684, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 199.828125, "completions/mean_terminated_length": 199.828125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.1791044776119403, "grad_norm": 1.8178476095199585, "kl": 0.32421875, "learning_rate": 7.121951219512196e-06, "loss": -0.1005, "num_tokens": 2211451.0, "reward": -0.77667236328125, "reward_std": 0.258938729763031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.5533447265625, "rewards/ppl_reward/std": 2.3659679889678955, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 273.140625, "completions/mean_terminated_length": 248.91934204101562, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.18032287541882425, "grad_norm": 1.530160665512085, "kl": 0.241943359375, "learning_rate": 7.170731707317074e-06, "loss": 0.1241, "num_tokens": 2236212.0, "reward": -0.6563720703125, "reward_std": 0.5002977252006531, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.179931640625, "rewards/ppl_reward/std": 2.789236545562744, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 268.7742004394531, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1815412732257082, "grad_norm": 1.7426680326461792, "kl": 0.232177734375, "learning_rate": 7.219512195121952e-06, "loss": 0.2698, "num_tokens": 2261772.0, "reward": -1.665771484375, "reward_std": 0.7386971712112427, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.13623046875, "rewards/ppl_reward/std": 4.355805397033691, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 283.484375, "completions/mean_terminated_length": 259.5967712402344, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.18275967103259214, "grad_norm": 1.5331463813781738, "kl": 0.237060546875, "learning_rate": 7.268292682926829e-06, "loss": 0.1971, "num_tokens": 2286451.0, "reward": -2.6693115234375, "reward_std": 0.8593677282333374, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -9.174560546875, "rewards/ppl_reward/std": 5.831031322479248, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 248.96875, "completions/mean_terminated_length": 248.96875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.1839780688394761, "grad_norm": 1.5464032888412476, "kl": 0.255615234375, "learning_rate": 7.317073170731707e-06, "loss": -0.0847, "num_tokens": 2309281.0, "reward": -2.6060791015625, "reward_std": 0.5082137584686279, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.134033203125, "rewards/ppl_reward/std": 6.209072589874268, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 283.78125, "completions/mean_terminated_length": 283.78125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.18519646664636003, "grad_norm": 1.363835334777832, "kl": 0.24169921875, "learning_rate": 7.3658536585365855e-06, "loss": -0.0485, "num_tokens": 2335331.0, "reward": -2.186767578125, "reward_std": 0.6422680616378784, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.13134765625, "rewards/ppl_reward/std": 4.814605236053467, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 219.3125, "completions/mean_terminated_length": 219.3125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.18641486445324398, "grad_norm": 1.7556971311569214, "kl": 0.268310546875, "learning_rate": 7.414634146341464e-06, "loss": 0.0099, "num_tokens": 2356175.0, "reward": -0.982421875, "reward_std": 0.23654048144817352, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.93359375, "rewards/ppl_reward/std": 2.5713045597076416, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 226.609375, "completions/mean_terminated_length": 226.609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.18763326226012794, "grad_norm": 1.7704877853393555, "kl": 0.24853515625, "learning_rate": 7.463414634146342e-06, "loss": 0.0181, "num_tokens": 2378246.0, "reward": -0.66204833984375, "reward_std": 0.22815804183483124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.3240966796875, "rewards/ppl_reward/std": 3.175370931625366, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 239.8125, "completions/mean_terminated_length": 227.36509704589844, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.18885166006701187, "grad_norm": 1.529033899307251, "kl": 0.22412109375, "learning_rate": 7.51219512195122e-06, "loss": 0.1346, "num_tokens": 2400674.0, "reward": -2.9837646484375, "reward_std": 0.5440269708633423, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.928466796875, "rewards/ppl_reward/std": 7.677490711212158, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 242.84375, "completions/mean_terminated_length": 242.84375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.19007005787389583, "grad_norm": 1.4848034381866455, "kl": 0.23193359375, "learning_rate": 7.560975609756098e-06, "loss": -0.0582, "num_tokens": 2423432.0, "reward": -0.9073486328125, "reward_std": 0.6081538200378418, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.775634765625, "rewards/ppl_reward/std": 2.551365852355957, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 244.265625, "completions/mean_terminated_length": 231.88890075683594, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.19128845568077976, "grad_norm": 1.8721567392349243, "kl": 0.248291015625, "learning_rate": 7.609756097560976e-06, "loss": 0.022, "num_tokens": 2445825.0, "reward": -1.01416015625, "reward_std": 0.3012954294681549, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.8486328125, "rewards/ppl_reward/std": 3.064713478088379, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 265.71875, "completions/mean_terminated_length": 265.71875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.19250685348766372, "grad_norm": 1.6640863418579102, "kl": 0.226806640625, "learning_rate": 7.658536585365855e-06, "loss": -0.1052, "num_tokens": 2470159.0, "reward": -1.9517822265625, "reward_std": 0.32628950476646423, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.856689453125, "rewards/ppl_reward/std": 4.796881675720215, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 313.265625, "completions/mean_terminated_length": 301.984130859375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.19372525129454768, "grad_norm": 1.350239634513855, "kl": 0.189453125, "learning_rate": 7.707317073170732e-06, "loss": 0.0333, "num_tokens": 2499744.0, "reward": -1.08258056640625, "reward_std": 0.4762732982635498, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.0401611328125, "rewards/ppl_reward/std": 3.1466405391693115, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 226.515625, "completions/mean_terminated_length": 226.515625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1949436491014316, "grad_norm": 1.647099494934082, "kl": 0.23974609375, "learning_rate": 7.75609756097561e-06, "loss": -0.0403, "num_tokens": 2522081.0, "reward": -0.8560791015625, "reward_std": 0.17524845898151398, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.641845703125, "rewards/ppl_reward/std": 2.5438830852508545, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 222.96875, "completions/mean_terminated_length": 222.96875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.19616204690831557, "grad_norm": 1.6764061450958252, "kl": 0.23388671875, "learning_rate": 7.804878048780489e-06, "loss": 0.0416, "num_tokens": 2543999.0, "reward": -1.8193359375, "reward_std": 0.41520190238952637, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.560546875, "rewards/ppl_reward/std": 6.69558048248291, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 182.890625, "completions/mean_terminated_length": 182.890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.1973804447151995, "grad_norm": 1.9887638092041016, "kl": 0.272216796875, "learning_rate": 7.853658536585366e-06, "loss": -0.0274, "num_tokens": 2561792.0, "reward": -2.107666015625, "reward_std": 0.4880565404891968, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.18408203125, "rewards/ppl_reward/std": 5.151211261749268, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 213.59375, "completions/mean_terminated_length": 213.59375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.19859884252208346, "grad_norm": 1.7334083318710327, "kl": 0.247802734375, "learning_rate": 7.902439024390245e-06, "loss": 0.0057, "num_tokens": 2582574.0, "reward": -1.5234375, "reward_std": 0.39927950501441956, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.015625, "rewards/ppl_reward/std": 4.332836151123047, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 201.84375, "completions/mean_terminated_length": 201.84375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.19981724032896742, "grad_norm": 1.763810396194458, "kl": 0.251708984375, "learning_rate": 7.951219512195122e-06, "loss": -0.015, "num_tokens": 2602236.0, "reward": -6.83795166015625, "reward_std": 0.8724244236946106, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -17.5977783203125, "rewards/ppl_reward/std": 30.27268409729004, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 176.40625, "completions/mean_terminated_length": 176.40625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.20103563813585135, "grad_norm": 1.906370997428894, "kl": 0.2978515625, "learning_rate": 8.000000000000001e-06, "loss": -0.0189, "num_tokens": 2620678.0, "reward": -1.3291015625, "reward_std": 0.30588847398757935, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.525390625, "rewards/ppl_reward/std": 4.225419521331787, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 184.34375, "completions/mean_terminated_length": 184.34375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2022540359427353, "grad_norm": 2.2092912197113037, "kl": 0.273193359375, "learning_rate": 8.048780487804879e-06, "loss": 0.1555, "num_tokens": 2639164.0, "reward": -2.2705078125, "reward_std": 0.9517076015472412, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.400390625, "rewards/ppl_reward/std": 7.223329544067383, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.20347243374961926, "grad_norm": 1.8748780488967896, "kl": 0.2919921875, "learning_rate": 8.097560975609758e-06, "loss": 0.0443, "num_tokens": 2656796.0, "reward": -0.85528564453125, "reward_std": 0.24279536306858063, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.6480712890625, "rewards/ppl_reward/std": 2.017183780670166, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 175.171875, "completions/mean_terminated_length": 175.171875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.2046908315565032, "grad_norm": 1.820359706878662, "kl": 0.294677734375, "learning_rate": 8.146341463414635e-06, "loss": 0.0102, "num_tokens": 2674783.0, "reward": -0.9984130859375, "reward_std": 0.33713454008102417, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.934326171875, "rewards/ppl_reward/std": 2.27929425239563, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 170.859375, "completions/mean_terminated_length": 170.859375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.20590922936338715, "grad_norm": 1.8888001441955566, "kl": 0.30224609375, "learning_rate": 8.195121951219512e-06, "loss": 0.0225, "num_tokens": 2693206.0, "reward": -0.4534912109375, "reward_std": 0.21669097244739532, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.844482421875, "rewards/ppl_reward/std": 3.2505977153778076, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 148.265625, "completions/mean_terminated_length": 148.265625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.20712762717027108, "grad_norm": 2.0685667991638184, "kl": 0.326171875, "learning_rate": 8.243902439024391e-06, "loss": -0.0313, "num_tokens": 2709319.0, "reward": -8.03955078125, "reward_std": 2.4255259037017822, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -20.0478515625, "rewards/ppl_reward/std": 28.692907333374023, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.20834602497715504, "grad_norm": 1.9326188564300537, "kl": 0.31005859375, "learning_rate": 8.292682926829268e-06, "loss": -0.0402, "num_tokens": 2727647.0, "reward": -4.1785888671875, "reward_std": 0.694019079208374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.357177734375, "rewards/ppl_reward/std": 10.604668617248535, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 144.515625, "completions/mean_terminated_length": 144.515625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.209564422784039, "grad_norm": 2.0811212062835693, "kl": 0.34326171875, "learning_rate": 8.341463414634147e-06, "loss": -0.0529, "num_tokens": 2743664.0, "reward": 0.01226806640625, "reward_std": 0.22908654808998108, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -3.9442138671875, "rewards/ppl_reward/std": 1.6470913887023926, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 150.921875, "completions/mean_terminated_length": 150.921875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.21078282059092293, "grad_norm": 2.0683515071868896, "kl": 0.3212890625, "learning_rate": 8.390243902439025e-06, "loss": 0.0294, "num_tokens": 2760683.0, "reward": -1.493408203125, "reward_std": 0.5891233086585999, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.90087890625, "rewards/ppl_reward/std": 4.772750377655029, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 147.953125, "completions/mean_terminated_length": 147.953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.2120012183978069, "grad_norm": 2.1137125492095947, "kl": 0.35888671875, "learning_rate": 8.439024390243902e-06, "loss": 0.0241, "num_tokens": 2777104.0, "reward": -2.1143798828125, "reward_std": 0.8603163957595825, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.150634765625, "rewards/ppl_reward/std": 5.60657262802124, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 145.15625, "completions/mean_terminated_length": 145.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.21321961620469082, "grad_norm": 2.070632219314575, "kl": 0.34912109375, "learning_rate": 8.487804878048781e-06, "loss": -0.0762, "num_tokens": 2793258.0, "reward": -6.3829345703125, "reward_std": 0.7746678590774536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -16.765869140625, "rewards/ppl_reward/std": 29.22699546813965, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 149.640625, "completions/mean_terminated_length": 149.640625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.21443801401157478, "grad_norm": 2.0191195011138916, "kl": 0.33935546875, "learning_rate": 8.536585365853658e-06, "loss": 0.0114, "num_tokens": 2810355.0, "reward": -0.80419921875, "reward_std": 0.22346001863479614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.6083984375, "rewards/ppl_reward/std": 3.291100263595581, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.21565641181845874, "grad_norm": 2.211615562438965, "kl": 0.38134765625, "learning_rate": 8.585365853658537e-06, "loss": -0.0766, "num_tokens": 2825627.0, "reward": -3.576904296875, "reward_std": 0.9321581125259399, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.15380859375, "rewards/ppl_reward/std": 4.623821258544922, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.21687480962534267, "grad_norm": 2.053856134414673, "kl": 0.33740234375, "learning_rate": 8.634146341463415e-06, "loss": -0.0723, "num_tokens": 2843591.0, "reward": -0.922119140625, "reward_std": 0.2144746035337448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.84423828125, "rewards/ppl_reward/std": 2.527523994445801, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.21809320743222663, "grad_norm": 1.9909114837646484, "kl": 0.3603515625, "learning_rate": 8.682926829268294e-06, "loss": 0.0188, "num_tokens": 2860423.0, "reward": -0.7130126953125, "reward_std": 0.258717805147171, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.386962890625, "rewards/ppl_reward/std": 1.8759479522705078, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 171.703125, "completions/mean_terminated_length": 171.703125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.21931160523911056, "grad_norm": 1.9899379014968872, "kl": 0.35595703125, "learning_rate": 8.731707317073171e-06, "loss": -0.0527, "num_tokens": 2878356.0, "reward": -1.3309326171875, "reward_std": 0.27312466502189636, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.661865234375, "rewards/ppl_reward/std": 4.3953022956848145, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 163.171875, "completions/mean_terminated_length": 163.171875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.22053000304599452, "grad_norm": 2.0005650520324707, "kl": 0.40283203125, "learning_rate": 8.78048780487805e-06, "loss": -0.0264, "num_tokens": 2895183.0, "reward": -0.8099365234375, "reward_std": 0.3032469153404236, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.580810546875, "rewards/ppl_reward/std": 4.234728813171387, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 216.265625, "completions/mean_terminated_length": 216.265625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.22174840085287847, "grad_norm": 1.7178394794464111, "kl": 0.3134765625, "learning_rate": 8.829268292682927e-06, "loss": -0.0098, "num_tokens": 2916320.0, "reward": -1.1197509765625, "reward_std": 0.2364557385444641, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.200439453125, "rewards/ppl_reward/std": 3.3447020053863525, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 225.53125, "completions/mean_terminated_length": 225.53125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2229667986597624, "grad_norm": 1.7417360544204712, "kl": 0.33935546875, "learning_rate": 8.878048780487806e-06, "loss": 0.0109, "num_tokens": 2937314.0, "reward": -6.3450927734375, "reward_std": 3.2580838203430176, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -16.651123046875, "rewards/ppl_reward/std": 30.372535705566406, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 249.828125, "completions/mean_terminated_length": 249.828125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.22418519646664636, "grad_norm": 1.8500088453292847, "kl": 0.29296875, "learning_rate": 8.926829268292683e-06, "loss": 0.0168, "num_tokens": 2960567.0, "reward": -2.042236328125, "reward_std": 0.2842506170272827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.08447265625, "rewards/ppl_reward/std": 2.48500394821167, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 233.21875, "completions/mean_terminated_length": 233.21875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2254035942735303, "grad_norm": 1.8068212270736694, "kl": 0.3154296875, "learning_rate": 8.975609756097562e-06, "loss": -0.049, "num_tokens": 2982293.0, "reward": -0.15625, "reward_std": 0.2430024892091751, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.2265625, "rewards/ppl_reward/std": 2.4834132194519043, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 244.734375, "completions/mean_terminated_length": 244.734375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.22662199208041425, "grad_norm": 1.7277573347091675, "kl": 0.28564453125, "learning_rate": 9.02439024390244e-06, "loss": 0.02, "num_tokens": 3005156.0, "reward": -2.3642578125, "reward_std": 0.2715250253677368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.728515625, "rewards/ppl_reward/std": 5.199014186859131, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 229.5625, "completions/mean_terminated_length": 229.5625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2278403898872982, "grad_norm": 1.8236923217773438, "kl": 0.3125, "learning_rate": 9.073170731707319e-06, "loss": -0.0565, "num_tokens": 3026608.0, "reward": -1.204833984375, "reward_std": 0.6229165196418762, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.33154296875, "rewards/ppl_reward/std": 2.393322467803955, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 253.703125, "completions/mean_terminated_length": 253.703125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.22905878769418214, "grad_norm": 1.6870782375335693, "kl": 0.294921875, "learning_rate": 9.121951219512196e-06, "loss": -0.1221, "num_tokens": 3049581.0, "reward": -0.868896484375, "reward_std": 0.18499739468097687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.73779296875, "rewards/ppl_reward/std": 3.3268089294433594, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 252.5625, "completions/mean_terminated_length": 252.5625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2302771855010661, "grad_norm": 1.7175002098083496, "kl": 0.2978515625, "learning_rate": 9.170731707317075e-06, "loss": 0.0134, "num_tokens": 3072353.0, "reward": -0.433837890625, "reward_std": 0.5563007593154907, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.77392578125, "rewards/ppl_reward/std": 2.474109649658203, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 332.15625, "completions/mean_terminated_length": 298.1311340332031, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.23149558330795006, "grad_norm": 3.8090860843658447, "kl": 0.26171875, "learning_rate": 9.219512195121952e-06, "loss": 0.039, "num_tokens": 3101115.0, "reward": -2.494140625, "reward_std": 0.9066250324249268, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.40234375, "rewards/ppl_reward/std": 7.4515509605407715, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.25341787934303284, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 271.765625, "completions/mean_terminated_length": 271.765625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.232713981114834, "grad_norm": 1.7193796634674072, "kl": 0.287353515625, "learning_rate": 9.268292682926831e-06, "loss": -0.125, "num_tokens": 3125628.0, "reward": -1.6849365234375, "reward_std": 0.36807507276535034, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.338623046875, "rewards/ppl_reward/std": 4.991385459899902, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 301.96875, "completions/mean_terminated_length": 301.96875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.23393237892171795, "grad_norm": 1.49349045753479, "kl": 0.28271484375, "learning_rate": 9.317073170731709e-06, "loss": -0.0176, "num_tokens": 3151962.0, "reward": -1.1678466796875, "reward_std": 0.22430121898651123, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.296630859375, "rewards/ppl_reward/std": 5.039465427398682, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 302.96875, "completions/mean_terminated_length": 302.96875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.23515077672860188, "grad_norm": 1.3841097354888916, "kl": 0.2744140625, "learning_rate": 9.365853658536586e-06, "loss": 0.0137, "num_tokens": 3179184.0, "reward": -0.31671142578125, "reward_std": 0.190385103225708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.6334228515625, "rewards/ppl_reward/std": 1.955211877822876, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 380.015625, "completions/mean_terminated_length": 369.7936706542969, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.23636917453548584, "grad_norm": 1.4631717205047607, "kl": 0.240234375, "learning_rate": 9.414634146341463e-06, "loss": 0.1173, "num_tokens": 3211905.0, "reward": -0.87603759765625, "reward_std": 0.26001498103141785, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.6817626953125, "rewards/ppl_reward/std": 4.918431758880615, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 328.21875, "completions/mean_terminated_length": 317.17462158203125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2375875723423698, "grad_norm": 1.4767104387283325, "kl": 0.25341796875, "learning_rate": 9.463414634146342e-06, "loss": 0.065, "num_tokens": 3239807.0, "reward": -1.5384521484375, "reward_std": 0.6992493271827698, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.983154296875, "rewards/ppl_reward/std": 5.53750467300415, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 317.078125, "completions/mean_terminated_length": 317.078125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.23880597014925373, "grad_norm": 1.4141061305999756, "kl": 0.26904296875, "learning_rate": 9.51219512195122e-06, "loss": -0.0156, "num_tokens": 3266428.0, "reward": -1.4107666015625, "reward_std": 0.3654784560203552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.821533203125, "rewards/ppl_reward/std": 4.34868049621582, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 292.84375, "completions/mean_terminated_length": 292.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.24002436795613769, "grad_norm": 1.715597152709961, "kl": 0.2919921875, "learning_rate": 9.560975609756098e-06, "loss": 0.067, "num_tokens": 3292458.0, "reward": -0.942138671875, "reward_std": 0.2143101543188095, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.88427734375, "rewards/ppl_reward/std": 2.328453302383423, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 262.890625, "completions/mean_terminated_length": 262.890625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.24124276576302162, "grad_norm": 1.6462796926498413, "kl": 0.280029296875, "learning_rate": 9.609756097560976e-06, "loss": 0.0337, "num_tokens": 3316339.0, "reward": -1.0308837890625, "reward_std": 0.32698380947113037, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.061767578125, "rewards/ppl_reward/std": 4.196681022644043, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 219.765625, "completions/mean_terminated_length": 219.765625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.24246116356990557, "grad_norm": 1.768150806427002, "kl": 0.30322265625, "learning_rate": 9.658536585365855e-06, "loss": 0.0076, "num_tokens": 3337676.0, "reward": -1.5416259765625, "reward_std": 0.3384567201137543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.083251953125, "rewards/ppl_reward/std": 4.291332721710205, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 212.015625, "completions/mean_terminated_length": 212.015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.24367956137678953, "grad_norm": 1.7163819074630737, "kl": 0.30859375, "learning_rate": 9.707317073170732e-06, "loss": 0.0506, "num_tokens": 3357813.0, "reward": -2.043701171875, "reward_std": 0.4171565771102905, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.97802734375, "rewards/ppl_reward/std": 2.354487180709839, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 220.40625, "completions/mean_terminated_length": 220.40625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.24489795918367346, "grad_norm": 1.636833667755127, "kl": 0.30712890625, "learning_rate": 9.756097560975611e-06, "loss": 0.0219, "num_tokens": 3379167.0, "reward": -1.0076904296875, "reward_std": 1.050585150718689, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.968505859375, "rewards/ppl_reward/std": 6.78851842880249, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 176.265625, "completions/mean_terminated_length": 176.265625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.24611635699055742, "grad_norm": 1.7583590745925903, "kl": 0.31396484375, "learning_rate": 9.804878048780488e-06, "loss": -0.0349, "num_tokens": 3397464.0, "reward": -1.086181640625, "reward_std": 0.2298690527677536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.17236328125, "rewards/ppl_reward/std": 2.6020243167877197, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 171.265625, "completions/mean_terminated_length": 171.265625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.24733475479744135, "grad_norm": 1.8064614534378052, "kl": 0.34765625, "learning_rate": 9.853658536585367e-06, "loss": 0.0158, "num_tokens": 3415177.0, "reward": -0.3280029296875, "reward_std": 0.17677925527095795, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.624755859375, "rewards/ppl_reward/std": 2.737375497817993, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 153.21875, "completions/mean_terminated_length": 153.21875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.2485531526043253, "grad_norm": 1.8876389265060425, "kl": 0.353515625, "learning_rate": 9.902439024390245e-06, "loss": -0.0659, "num_tokens": 3431903.0, "reward": -2.940673828125, "reward_std": 0.6337758302688599, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.78759765625, "rewards/ppl_reward/std": 5.076391696929932, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 167.3125, "completions/mean_terminated_length": 153.71429443359375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.24977155041120927, "grad_norm": 2.0623466968536377, "kl": 0.326171875, "learning_rate": 9.951219512195124e-06, "loss": 0.235, "num_tokens": 3449579.0, "reward": -0.9693603515625, "reward_std": 0.3341052532196045, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.829345703125, "rewards/ppl_reward/std": 3.390671730041504, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 162.90625, "completions/mean_terminated_length": 162.90625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.25098994821809323, "grad_norm": 1.9773200750350952, "kl": 0.33349609375, "learning_rate": 1e-05, "loss": 0.0239, "num_tokens": 3467725.0, "reward": -1.99853515625, "reward_std": 0.3624621629714966, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.9501953125, "rewards/ppl_reward/std": 7.386921405792236, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.25220834602497716, "grad_norm": 1.8105636835098267, "kl": 0.333984375, "learning_rate": 1.0048780487804878e-05, "loss": 0.0075, "num_tokens": 3485609.0, "reward": -1.303466796875, "reward_std": 0.316351056098938, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.53662109375, "rewards/ppl_reward/std": 2.2769126892089844, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 149.390625, "completions/mean_terminated_length": 149.390625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.2534267438318611, "grad_norm": 2.0761802196502686, "kl": 0.37060546875, "learning_rate": 1.0097560975609757e-05, "loss": 0.0207, "num_tokens": 3501714.0, "reward": -0.8316650390625, "reward_std": 0.2830313444137573, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.530517578125, "rewards/ppl_reward/std": 3.3832647800445557, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 161.109375, "completions/mean_terminated_length": 161.109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2546451416387451, "grad_norm": 1.9001517295837402, "kl": 0.359375, "learning_rate": 1.0146341463414634e-05, "loss": -0.0454, "num_tokens": 3518617.0, "reward": -2.1409912109375, "reward_std": 0.6127506494522095, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.180419921875, "rewards/ppl_reward/std": 4.2253570556640625, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.255863539445629, "grad_norm": 1.7644462585449219, "kl": 0.34326171875, "learning_rate": 1.0195121951219513e-05, "loss": -0.0984, "num_tokens": 3535389.0, "reward": -2.5869140625, "reward_std": 0.48909980058670044, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.142578125, "rewards/ppl_reward/std": 5.261679172515869, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 143.71875, "completions/mean_terminated_length": 143.71875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.25708193725251294, "grad_norm": 2.0198538303375244, "kl": 0.3662109375, "learning_rate": 1.024390243902439e-05, "loss": 0.0831, "num_tokens": 3551419.0, "reward": -2.4903564453125, "reward_std": 0.3398621082305908, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.941650390625, "rewards/ppl_reward/std": 7.831767559051514, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 170.828125, "completions/mean_terminated_length": 170.828125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.25830033505939687, "grad_norm": 1.9317320585250854, "kl": 0.3427734375, "learning_rate": 1.029268292682927e-05, "loss": 0.038, "num_tokens": 3570104.0, "reward": -2.14697265625, "reward_std": 0.35207051038742065, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.2392578125, "rewards/ppl_reward/std": 9.423612594604492, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 169.078125, "completions/mean_terminated_length": 169.078125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.25951873286628085, "grad_norm": 1.8753331899642944, "kl": 0.3857421875, "learning_rate": 1.0341463414634147e-05, "loss": -0.013, "num_tokens": 3587677.0, "reward": -0.82415771484375, "reward_std": 0.3344525098800659, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.6092529296875, "rewards/ppl_reward/std": 3.6789462566375732, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 181.71875, "completions/mean_terminated_length": 181.71875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.2607371306731648, "grad_norm": 1.965737223625183, "kl": 0.36474609375, "learning_rate": 1.0390243902439026e-05, "loss": 0.0436, "num_tokens": 3606243.0, "reward": -1.322998046875, "reward_std": 0.3285561501979828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.64599609375, "rewards/ppl_reward/std": 2.9137039184570312, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2619555284800487, "grad_norm": 1.8424474000930786, "kl": 0.3984375, "learning_rate": 1.0439024390243903e-05, "loss": 0.0858, "num_tokens": 3624099.0, "reward": -5.023193359375, "reward_std": 0.8658571243286133, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -14.03076171875, "rewards/ppl_reward/std": 18.971391677856445, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 186.109375, "completions/mean_terminated_length": 172.80953979492188, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.2631739262869327, "grad_norm": 1.9560121297836304, "kl": 0.359375, "learning_rate": 1.0487804878048782e-05, "loss": 0.1557, "num_tokens": 3643378.0, "reward": -2.0126953125, "reward_std": 0.7138792276382446, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.970703125, "rewards/ppl_reward/std": 4.42836332321167, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 138.828125, "completions/mean_terminated_length": 138.828125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.26439232409381663, "grad_norm": 1.969612956047058, "kl": 0.43505859375, "learning_rate": 1.053658536585366e-05, "loss": -0.0541, "num_tokens": 3658759.0, "reward": -0.4173583984375, "reward_std": 0.2254246473312378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.834716796875, "rewards/ppl_reward/std": 2.5502026081085205, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.26561072190070056, "grad_norm": 2.1832220554351807, "kl": 0.4384765625, "learning_rate": 1.0585365853658538e-05, "loss": 0.0253, "num_tokens": 3676087.0, "reward": -0.50341796875, "reward_std": 0.3079666495323181, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.9287109375, "rewards/ppl_reward/std": 3.084188938140869, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.26682911970758455, "grad_norm": 1.7568196058273315, "kl": 0.3671875, "learning_rate": 1.0634146341463416e-05, "loss": 0.0176, "num_tokens": 3694559.0, "reward": -1.94384765625, "reward_std": 0.43785521388053894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.8408203125, "rewards/ppl_reward/std": 4.657278537750244, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 177.046875, "completions/mean_terminated_length": 177.046875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.2680475175144685, "grad_norm": 1.7915352582931519, "kl": 0.3955078125, "learning_rate": 1.0682926829268295e-05, "loss": -0.0448, "num_tokens": 3712690.0, "reward": -1.544189453125, "reward_std": 0.6019219756126404, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.08837890625, "rewards/ppl_reward/std": 2.9250216484069824, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 165.203125, "completions/mean_terminated_length": 165.203125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.2692659153213524, "grad_norm": 1.9536157846450806, "kl": 0.41015625, "learning_rate": 1.0731707317073172e-05, "loss": 0.0374, "num_tokens": 3729935.0, "reward": -0.442626953125, "reward_std": 0.27641761302948, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.73681640625, "rewards/ppl_reward/std": 2.966219425201416, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 194.34375, "completions/mean_terminated_length": 194.34375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.27048431312823634, "grad_norm": 2.003105878829956, "kl": 0.39697265625, "learning_rate": 1.0780487804878051e-05, "loss": 0.0804, "num_tokens": 3750333.0, "reward": -0.5458984375, "reward_std": 0.3454636335372925, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.935546875, "rewards/ppl_reward/std": 2.223686456680298, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 195.46875, "completions/mean_terminated_length": 195.46875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.27170271093512033, "grad_norm": 1.741651177406311, "kl": 0.35546875, "learning_rate": 1.0829268292682928e-05, "loss": 0.0073, "num_tokens": 3769859.0, "reward": -3.3382568359375, "reward_std": 0.7457098960876465, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.410888671875, "rewards/ppl_reward/std": 12.52772045135498, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 188.109375, "completions/mean_terminated_length": 188.109375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.27292110874200426, "grad_norm": 1.8694076538085938, "kl": 0.376953125, "learning_rate": 1.0878048780487807e-05, "loss": -0.0701, "num_tokens": 3788394.0, "reward": -1.794921875, "reward_std": 0.7218828797340393, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.54296875, "rewards/ppl_reward/std": 3.6753547191619873, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 184.359375, "completions/mean_terminated_length": 184.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2741395065488882, "grad_norm": 1.7742443084716797, "kl": 0.3759765625, "learning_rate": 1.0926829268292685e-05, "loss": -0.0145, "num_tokens": 3807241.0, "reward": -2.27734375, "reward_std": 0.6296431422233582, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.359375, "rewards/ppl_reward/std": 6.011316299438477, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 177.8125, "completions/mean_terminated_length": 177.8125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2753579043557722, "grad_norm": 1.953503131866455, "kl": 0.36865234375, "learning_rate": 1.0975609756097562e-05, "loss": -0.0498, "num_tokens": 3825389.0, "reward": -1.5631103515625, "reward_std": 0.3095080256462097, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.048095703125, "rewards/ppl_reward/std": 4.775688171386719, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 219.203125, "completions/mean_terminated_length": 219.203125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2765763021626561, "grad_norm": 1.5605069398880005, "kl": 0.33642578125, "learning_rate": 1.1024390243902441e-05, "loss": -0.0116, "num_tokens": 3847234.0, "reward": -3.325439453125, "reward_std": 0.6441425681114197, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.61181640625, "rewards/ppl_reward/std": 13.837275505065918, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 205.28125, "completions/mean_terminated_length": 205.28125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.27779469996954004, "grad_norm": 1.5863921642303467, "kl": 0.38720703125, "learning_rate": 1.1073170731707318e-05, "loss": -0.1013, "num_tokens": 3866428.0, "reward": -1.472412109375, "reward_std": 0.49977362155914307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.94482421875, "rewards/ppl_reward/std": 3.2974023818969727, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 249.15625, "completions/mean_terminated_length": 249.15625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.279013097776424, "grad_norm": 1.58004891872406, "kl": 0.36083984375, "learning_rate": 1.1121951219512197e-05, "loss": -0.0577, "num_tokens": 3889926.0, "reward": -1.0050048828125, "reward_std": 0.3550030589103699, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.978759765625, "rewards/ppl_reward/std": 3.7723708152770996, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 312.78125, "completions/mean_terminated_length": 312.78125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.28023149558330795, "grad_norm": 1.5883455276489258, "kl": 0.3447265625, "learning_rate": 1.1170731707317074e-05, "loss": 0.0409, "num_tokens": 3916736.0, "reward": -1.306640625, "reward_std": 0.3406689763069153, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.48046875, "rewards/ppl_reward/std": 4.984902381896973, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 328.71875, "completions/mean_terminated_length": 328.71875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2814498933901919, "grad_norm": 1.3339654207229614, "kl": 0.33251953125, "learning_rate": 1.1219512195121953e-05, "loss": 0.0451, "num_tokens": 3945142.0, "reward": -0.4708251953125, "reward_std": 0.27398574352264404, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.855712890625, "rewards/ppl_reward/std": 1.8199886083602905, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 340.390625, "completions/mean_terminated_length": 340.390625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.28266829119707587, "grad_norm": 1.452980637550354, "kl": 0.36328125, "learning_rate": 1.1268292682926829e-05, "loss": 0.0116, "num_tokens": 3973199.0, "reward": -2.49365234375, "reward_std": 0.5333764553070068, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.8388671875, "rewards/ppl_reward/std": 5.134278774261475, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 401.671875, "completions/mean_terminated_length": 391.7936706542969, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2838866890039598, "grad_norm": 1.3195852041244507, "kl": 0.32421875, "learning_rate": 1.1317073170731706e-05, "loss": 0.0221, "num_tokens": 4006130.0, "reward": -3.34405517578125, "reward_std": 0.7311201095581055, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -10.6021728515625, "rewards/ppl_reward/std": 5.459533214569092, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 409.90625, "completions/mean_terminated_length": 409.90625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.28510508681084373, "grad_norm": 1.2035330533981323, "kl": 0.32763671875, "learning_rate": 1.1365853658536585e-05, "loss": -0.0339, "num_tokens": 4039004.0, "reward": -0.83905029296875, "reward_std": 0.40677592158317566, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.4984130859375, "rewards/ppl_reward/std": 2.6161837577819824, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 379.828125, "completions/mean_terminated_length": 369.60321044921875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.28632348461772766, "grad_norm": 1.4087101221084595, "kl": 0.330078125, "learning_rate": 1.1414634146341463e-05, "loss": 0.0445, "num_tokens": 4070305.0, "reward": -1.2713623046875, "reward_std": 0.31071004271507263, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.527099609375, "rewards/ppl_reward/std": 3.220541477203369, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 361.015625, "completions/mean_terminated_length": 350.4920959472656, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.28754188242461165, "grad_norm": 1.3708683252334595, "kl": 0.32958984375, "learning_rate": 1.1463414634146342e-05, "loss": -0.0628, "num_tokens": 4100658.0, "reward": -1.0023193359375, "reward_std": 0.25044769048690796, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.949951171875, "rewards/ppl_reward/std": 1.9360196590423584, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 417.625, "completions/mean_terminated_length": 398.06451416015625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2887602802314956, "grad_norm": 1.1654398441314697, "kl": 0.3212890625, "learning_rate": 1.1512195121951219e-05, "loss": 0.0525, "num_tokens": 4134394.0, "reward": -1.895263671875, "reward_std": 0.39265385270118713, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.68115234375, "rewards/ppl_reward/std": 4.779569625854492, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 318.65625, "completions/mean_terminated_length": 318.65625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2899786780383795, "grad_norm": 1.446476936340332, "kl": 0.33984375, "learning_rate": 1.1560975609756098e-05, "loss": 0.0566, "num_tokens": 4161684.0, "reward": -1.3267822265625, "reward_std": 0.49832555651664734, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.606689453125, "rewards/ppl_reward/std": 4.56362771987915, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 286.515625, "completions/mean_terminated_length": 286.515625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2911970758452635, "grad_norm": 1.4246041774749756, "kl": 0.34619140625, "learning_rate": 1.1609756097560975e-05, "loss": -0.0423, "num_tokens": 4187237.0, "reward": -0.77276611328125, "reward_std": 0.25452348589897156, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.5064697265625, "rewards/ppl_reward/std": 3.7490546703338623, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 256.6875, "completions/mean_terminated_length": 256.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.29241547365214743, "grad_norm": 1.4700735807418823, "kl": 0.35205078125, "learning_rate": 1.1658536585365854e-05, "loss": 0.0217, "num_tokens": 4210153.0, "reward": -0.19580078125, "reward_std": 0.21538181602954865, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.3291015625, "rewards/ppl_reward/std": 2.1551458835601807, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 256.859375, "completions/mean_terminated_length": 256.859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.29363387145903136, "grad_norm": 1.638140082359314, "kl": 0.36474609375, "learning_rate": 1.1707317073170731e-05, "loss": 0.0372, "num_tokens": 4233992.0, "reward": -1.110107421875, "reward_std": 0.2223253846168518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.22021484375, "rewards/ppl_reward/std": 2.6059765815734863, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 220.640625, "completions/mean_terminated_length": 220.640625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.29485226926591535, "grad_norm": 1.586604356765747, "kl": 0.37646484375, "learning_rate": 1.175609756097561e-05, "loss": 0.0268, "num_tokens": 4255113.0, "reward": -1.2459716796875, "reward_std": 0.5453722476959229, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.445068359375, "rewards/ppl_reward/std": 5.98010778427124, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 200.765625, "completions/mean_terminated_length": 200.765625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.2960706670727993, "grad_norm": 1.6280089616775513, "kl": 0.384765625, "learning_rate": 1.1804878048780488e-05, "loss": -0.0608, "num_tokens": 4274650.0, "reward": -3.4727783203125, "reward_std": 0.6116736531257629, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.945556640625, "rewards/ppl_reward/std": 7.279670715332031, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 181.96875, "completions/mean_terminated_length": 181.96875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.2972890648796832, "grad_norm": 1.8493640422821045, "kl": 0.37060546875, "learning_rate": 1.1853658536585367e-05, "loss": 0.0388, "num_tokens": 4294056.0, "reward": -0.4893798828125, "reward_std": 0.2636960744857788, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.908447265625, "rewards/ppl_reward/std": 1.8359839916229248, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 178.015625, "completions/mean_terminated_length": 178.015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.29850746268656714, "grad_norm": 1.8716539144515991, "kl": 0.408203125, "learning_rate": 1.1902439024390244e-05, "loss": 0.0607, "num_tokens": 4312473.0, "reward": -1.168212890625, "reward_std": 0.2937960624694824, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.29736328125, "rewards/ppl_reward/std": 2.5351815223693848, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 149.984375, "completions/mean_terminated_length": 149.984375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2997258604934511, "grad_norm": 1.8027973175048828, "kl": 0.41748046875, "learning_rate": 1.1951219512195123e-05, "loss": -0.0095, "num_tokens": 4329024.0, "reward": -2.3851318359375, "reward_std": 0.5056092739105225, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.692138671875, "rewards/ppl_reward/std": 5.611582279205322, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 162.265625, "completions/mean_terminated_length": 162.265625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.30094425830033505, "grad_norm": 1.625916838645935, "kl": 0.4013671875, "learning_rate": 1.2e-05, "loss": -0.0045, "num_tokens": 4346305.0, "reward": -2.5079345703125, "reward_std": 0.5759865045547485, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.945556640625, "rewards/ppl_reward/std": 7.510408401489258, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 148.96875, "completions/mean_terminated_length": 148.96875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.302162656107219, "grad_norm": 2.216472625732422, "kl": 0.47216796875, "learning_rate": 1.204878048780488e-05, "loss": 0.0258, "num_tokens": 4363295.0, "reward": -2.69140625, "reward_std": 0.446439266204834, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.3515625, "rewards/ppl_reward/std": 7.785342216491699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 129.4375, "completions/mean_terminated_length": 129.4375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.30338105391410297, "grad_norm": 1.9172817468643188, "kl": 0.46875, "learning_rate": 1.2097560975609757e-05, "loss": 0.0004, "num_tokens": 4378875.0, "reward": -0.6737060546875, "reward_std": 0.24730117619037628, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.316162109375, "rewards/ppl_reward/std": 1.8170627355575562, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 120.96875, "completions/mean_terminated_length": 120.96875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.3045994517209869, "grad_norm": 2.1277098655700684, "kl": 0.5166015625, "learning_rate": 1.2146341463414636e-05, "loss": -0.0268, "num_tokens": 4393713.0, "reward": -1.4542236328125, "reward_std": 0.347042977809906, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.838134765625, "rewards/ppl_reward/std": 6.461254119873047, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 121.59375, "completions/mean_terminated_length": 121.59375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.30581784952787083, "grad_norm": 2.046104907989502, "kl": 0.5615234375, "learning_rate": 1.2195121951219513e-05, "loss": -0.0228, "num_tokens": 4408903.0, "reward": -3.4755859375, "reward_std": 0.5228626728057861, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -10.888671875, "rewards/ppl_reward/std": 16.5909423828125, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 133.515625, "completions/mean_terminated_length": 133.515625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3070362473347548, "grad_norm": 2.132580518722534, "kl": 0.484375, "learning_rate": 1.2243902439024392e-05, "loss": -0.0253, "num_tokens": 4425856.0, "reward": -1.02001953125, "reward_std": 0.2623867392539978, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.0087890625, "rewards/ppl_reward/std": 2.19537615776062, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 121.84375, "completions/mean_terminated_length": 121.84375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.30825464514163875, "grad_norm": 2.0615572929382324, "kl": 0.53515625, "learning_rate": 1.2292682926829269e-05, "loss": 0.0105, "num_tokens": 4440862.0, "reward": -0.9569091796875, "reward_std": 0.39431533217430115, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.812255859375, "rewards/ppl_reward/std": 1.9165282249450684, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3094730429485227, "grad_norm": 2.10693359375, "kl": 0.5478515625, "learning_rate": 1.2341463414634146e-05, "loss": -0.0394, "num_tokens": 4455950.0, "reward": -7.142578125, "reward_std": 1.1331703662872314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -18.28515625, "rewards/ppl_reward/std": 29.814184188842773, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 127.90625, "completions/mean_terminated_length": 127.90625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.31069144075540667, "grad_norm": 2.0175507068634033, "kl": 0.52685546875, "learning_rate": 1.2390243902439025e-05, "loss": -0.0559, "num_tokens": 4471224.0, "reward": -0.36279296875, "reward_std": 0.3098941445350647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.7255859375, "rewards/ppl_reward/std": 2.737037181854248, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 133.078125, "completions/mean_terminated_length": 133.078125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3119098385622906, "grad_norm": 3.13507080078125, "kl": 0.595703125, "learning_rate": 1.2439024390243903e-05, "loss": 0.0093, "num_tokens": 4486741.0, "reward": -2.0106201171875, "reward_std": 0.24198634922504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.021240234375, "rewards/ppl_reward/std": 6.252007007598877, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 157.453125, "completions/mean_terminated_length": 157.453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.31312823636917453, "grad_norm": 1.8394560813903809, "kl": 0.53515625, "learning_rate": 1.2487804878048782e-05, "loss": 0.0745, "num_tokens": 4503794.0, "reward": -1.724853515625, "reward_std": 0.41535684466362, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.41064453125, "rewards/ppl_reward/std": 3.8361029624938965, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 150.96875, "completions/mean_terminated_length": 150.96875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.31434663417605846, "grad_norm": 1.6439635753631592, "kl": 0.49609375, "learning_rate": 1.2536585365853659e-05, "loss": -0.0625, "num_tokens": 4521520.0, "reward": -0.939453125, "reward_std": 0.17356139421463013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.87890625, "rewards/ppl_reward/std": 2.8355746269226074, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 166.515625, "completions/mean_terminated_length": 166.515625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.31556503198294245, "grad_norm": 1.69520103931427, "kl": 0.505859375, "learning_rate": 1.2585365853658538e-05, "loss": 0.0069, "num_tokens": 4538985.0, "reward": -0.88818359375, "reward_std": 0.21577967703342438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.7763671875, "rewards/ppl_reward/std": 2.0575599670410156, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3167834297898264, "grad_norm": 1.7886601686477661, "kl": 0.5341796875, "learning_rate": 1.2634146341463415e-05, "loss": 0.0024, "num_tokens": 4556661.0, "reward": -0.9967041015625, "reward_std": 0.2810119390487671, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.993408203125, "rewards/ppl_reward/std": 1.5389314889907837, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.3180018275967103, "grad_norm": 1.8752790689468384, "kl": 0.49609375, "learning_rate": 1.2682926829268294e-05, "loss": 0.051, "num_tokens": 4574577.0, "reward": -1.3009033203125, "reward_std": 0.2055133581161499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.601806640625, "rewards/ppl_reward/std": 3.3065154552459717, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 190.671875, "completions/mean_terminated_length": 190.671875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.3192202254035943, "grad_norm": 1.7533481121063232, "kl": 0.49755859375, "learning_rate": 1.2731707317073172e-05, "loss": -0.0729, "num_tokens": 4592876.0, "reward": -3.2261962890625, "reward_std": 0.45934146642684937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.452392578125, "rewards/ppl_reward/std": 5.503728866577148, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.3204386232104782, "grad_norm": 1.6567347049713135, "kl": 0.5126953125, "learning_rate": 1.278048780487805e-05, "loss": -0.0612, "num_tokens": 4611780.0, "reward": -2.424560546875, "reward_std": 0.35320401191711426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.84912109375, "rewards/ppl_reward/std": 9.417085647583008, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 221.953125, "completions/mean_terminated_length": 221.953125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.32165702101736215, "grad_norm": 1.5125033855438232, "kl": 0.4755859375, "learning_rate": 1.2829268292682928e-05, "loss": -0.0592, "num_tokens": 4633441.0, "reward": -0.493896484375, "reward_std": 0.14521394670009613, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.98779296875, "rewards/ppl_reward/std": 1.457244634628296, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 239.953125, "completions/mean_terminated_length": 239.953125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.32287541882424614, "grad_norm": 1.4808499813079834, "kl": 0.4404296875, "learning_rate": 1.2878048780487807e-05, "loss": 0.0385, "num_tokens": 4656574.0, "reward": -0.67919921875, "reward_std": 0.47557532787323, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.3193359375, "rewards/ppl_reward/std": 3.119722604751587, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 252.9375, "completions/mean_terminated_length": 252.9375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.32409381663113007, "grad_norm": 1.5543625354766846, "kl": 0.4287109375, "learning_rate": 1.2926829268292684e-05, "loss": 0.0905, "num_tokens": 4679914.0, "reward": -0.9256591796875, "reward_std": 0.23691648244857788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.851318359375, "rewards/ppl_reward/std": 2.112419843673706, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 240.265625, "completions/mean_terminated_length": 240.265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.325312214438014, "grad_norm": 1.7562434673309326, "kl": 0.48046875, "learning_rate": 1.2975609756097563e-05, "loss": 0.1179, "num_tokens": 4702563.0, "reward": -1.640380859375, "reward_std": 0.23189570009708405, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.21826171875, "rewards/ppl_reward/std": 4.924527168273926, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 274.34375, "completions/mean_terminated_length": 274.34375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.32653061224489793, "grad_norm": 1.4856659173965454, "kl": 0.42529296875, "learning_rate": 1.302439024390244e-05, "loss": -0.0096, "num_tokens": 4726841.0, "reward": -0.47900390625, "reward_std": 0.1211778074502945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.9580078125, "rewards/ppl_reward/std": 2.867584705352783, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 231.3125, "completions/mean_terminated_length": 231.3125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.3277490100517819, "grad_norm": 1.3981302976608276, "kl": 0.44775390625, "learning_rate": 1.307317073170732e-05, "loss": -0.0073, "num_tokens": 4748749.0, "reward": -2.0535888671875, "reward_std": 0.38868632912635803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.107177734375, "rewards/ppl_reward/std": 5.5817484855651855, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 224.453125, "completions/mean_terminated_length": 224.453125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.32896740785866585, "grad_norm": 1.6012166738510132, "kl": 0.470703125, "learning_rate": 1.3121951219512197e-05, "loss": 0.0189, "num_tokens": 4770066.0, "reward": -0.822265625, "reward_std": 0.169223815202713, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.64453125, "rewards/ppl_reward/std": 1.735325574874878, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 219.5625, "completions/mean_terminated_length": 219.5625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.3301858056655498, "grad_norm": 1.5556213855743408, "kl": 0.4609375, "learning_rate": 1.3170731707317076e-05, "loss": -0.0006, "num_tokens": 4790918.0, "reward": -0.609375, "reward_std": 0.20339995622634888, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.140625, "rewards/ppl_reward/std": 2.26831316947937, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 212.109375, "completions/mean_terminated_length": 212.109375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.33140420347243377, "grad_norm": 1.5892163515090942, "kl": 0.490234375, "learning_rate": 1.3219512195121953e-05, "loss": 0.1247, "num_tokens": 4811621.0, "reward": -1.1007080078125, "reward_std": 0.2251867949962616, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.201416015625, "rewards/ppl_reward/std": 2.453601360321045, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 180.1875, "completions/mean_terminated_length": 180.1875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3326226012793177, "grad_norm": 1.7593313455581665, "kl": 0.48388671875, "learning_rate": 1.326829268292683e-05, "loss": -0.0224, "num_tokens": 4830297.0, "reward": -1.6763916015625, "reward_std": 0.43098288774490356, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.352783203125, "rewards/ppl_reward/std": 8.276031494140625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 184.390625, "completions/mean_terminated_length": 184.390625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.33384099908620163, "grad_norm": 1.697869896888733, "kl": 0.4990234375, "learning_rate": 1.331707317073171e-05, "loss": 0.0912, "num_tokens": 4849074.0, "reward": -3.0386962890625, "reward_std": 1.4438027143478394, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.999267578125, "rewards/ppl_reward/std": 12.899811744689941, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 168.515625, "completions/mean_terminated_length": 168.515625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.3350593968930856, "grad_norm": 1.5932003259658813, "kl": 0.5546875, "learning_rate": 1.3365853658536587e-05, "loss": 0.0263, "num_tokens": 4866291.0, "reward": -1.647705078125, "reward_std": 0.2522047758102417, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.29541015625, "rewards/ppl_reward/std": 5.335022449493408, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 158.90625, "completions/mean_terminated_length": 158.90625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.33627779469996955, "grad_norm": 1.6455392837524414, "kl": 0.5029296875, "learning_rate": 1.3414634146341466e-05, "loss": -0.0387, "num_tokens": 4884405.0, "reward": -0.2987060546875, "reward_std": 0.17396055161952972, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.519287109375, "rewards/ppl_reward/std": 1.231968641281128, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 139.265625, "completions/mean_terminated_length": 139.265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.3374961925068535, "grad_norm": 1.7723004817962646, "kl": 0.5654296875, "learning_rate": 1.3463414634146343e-05, "loss": 0.0168, "num_tokens": 4900398.0, "reward": -2.526123046875, "reward_std": 0.5694432258605957, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.90380859375, "rewards/ppl_reward/std": 6.262637615203857, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 128.171875, "completions/mean_terminated_length": 128.171875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.33871459031373746, "grad_norm": 1.885459303855896, "kl": 0.578125, "learning_rate": 1.3512195121951222e-05, "loss": 0.0044, "num_tokens": 4915041.0, "reward": -2.344482421875, "reward_std": 0.35725536942481995, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.65771484375, "rewards/ppl_reward/std": 5.747951507568359, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3399329881206214, "grad_norm": 1.9069080352783203, "kl": 0.6240234375, "learning_rate": 1.3560975609756099e-05, "loss": 0.0456, "num_tokens": 4929109.0, "reward": -1.8653564453125, "reward_std": 0.43151891231536865, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.699462890625, "rewards/ppl_reward/std": 6.091916561126709, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 128.21875, "completions/mean_terminated_length": 128.21875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3411513859275053, "grad_norm": 2.200749158859253, "kl": 0.59765625, "learning_rate": 1.3609756097560978e-05, "loss": 0.126, "num_tokens": 4943835.0, "reward": -1.080322265625, "reward_std": 0.41351211071014404, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.08251953125, "rewards/ppl_reward/std": 4.094695091247559, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 120.203125, "completions/mean_terminated_length": 120.203125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.34236978373438925, "grad_norm": 2.002089262008667, "kl": 0.5908203125, "learning_rate": 1.3658536585365855e-05, "loss": 0.0274, "num_tokens": 4958096.0, "reward": -10.348876953125, "reward_std": 1.3970500230789185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -24.69775390625, "rewards/ppl_reward/std": 49.498939514160156, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 124.171875, "completions/mean_terminated_length": 124.171875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.34358818154127324, "grad_norm": 1.8398733139038086, "kl": 0.5576171875, "learning_rate": 1.3707317073170734e-05, "loss": -0.0066, "num_tokens": 4972723.0, "reward": -1.9385986328125, "reward_std": 0.35955876111984253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.877197265625, "rewards/ppl_reward/std": 4.246856689453125, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 133.703125, "completions/mean_terminated_length": 133.703125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.34480657934815717, "grad_norm": 1.844071626663208, "kl": 0.5849609375, "learning_rate": 1.375609756097561e-05, "loss": 0.0058, "num_tokens": 4988776.0, "reward": -1.27978515625, "reward_std": 0.2710605263710022, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.4892578125, "rewards/ppl_reward/std": 3.1190764904022217, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 118.46875, "completions/mean_terminated_length": 118.46875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3460249771550411, "grad_norm": 1.8747990131378174, "kl": 0.572265625, "learning_rate": 1.3804878048780487e-05, "loss": 0.0329, "num_tokens": 5002774.0, "reward": -1.4085693359375, "reward_std": 0.38115090131759644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.817138671875, "rewards/ppl_reward/std": 4.371301174163818, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 131.546875, "completions/mean_terminated_length": 131.546875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.3472433749619251, "grad_norm": 2.1475632190704346, "kl": 0.5927734375, "learning_rate": 1.3853658536585366e-05, "loss": -0.0355, "num_tokens": 5018321.0, "reward": -0.8125, "reward_std": 0.3422420918941498, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.5078125, "rewards/ppl_reward/std": 2.142862319946289, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.348461772768809, "grad_norm": 2.0270214080810547, "kl": 0.60546875, "learning_rate": 1.3902439024390244e-05, "loss": -0.0087, "num_tokens": 5032761.0, "reward": -1.628173828125, "reward_std": 0.7366716265678406, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.18603515625, "rewards/ppl_reward/std": 4.637361526489258, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 138.46875, "completions/mean_terminated_length": 138.46875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.34968017057569295, "grad_norm": 1.9284471273422241, "kl": 0.56884765625, "learning_rate": 1.3951219512195122e-05, "loss": 0.0208, "num_tokens": 5049175.0, "reward": -0.970947265625, "reward_std": 0.39995333552360535, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.77783203125, "rewards/ppl_reward/std": 2.7455992698669434, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 117.984375, "completions/mean_terminated_length": 117.984375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.35089856838257694, "grad_norm": 2.091575860977173, "kl": 0.619140625, "learning_rate": 1.4e-05, "loss": -0.001, "num_tokens": 5063110.0, "reward": -2.7744140625, "reward_std": 0.4559948444366455, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.509765625, "rewards/ppl_reward/std": 9.662552833557129, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 160.328125, "completions/mean_terminated_length": 160.328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.35211696618946087, "grad_norm": 1.7969919443130493, "kl": 0.52392578125, "learning_rate": 1.4048780487804879e-05, "loss": 0.0791, "num_tokens": 5081467.0, "reward": -1.18731689453125, "reward_std": 0.7107214331626892, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.3277587890625, "rewards/ppl_reward/std": 6.994799613952637, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 153.671875, "completions/mean_terminated_length": 153.671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3533353639963448, "grad_norm": 1.8128790855407715, "kl": 0.5712890625, "learning_rate": 1.4097560975609756e-05, "loss": 0.0448, "num_tokens": 5098286.0, "reward": -0.29541015625, "reward_std": 0.3686411678791046, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.5283203125, "rewards/ppl_reward/std": 1.6370151042938232, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 144.359375, "completions/mean_terminated_length": 144.359375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.35455376180322873, "grad_norm": 1.8342365026474, "kl": 0.580078125, "learning_rate": 1.4146341463414635e-05, "loss": 0.007, "num_tokens": 5114197.0, "reward": -1.7227783203125, "reward_std": 0.38535308837890625, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.414306640625, "rewards/ppl_reward/std": 5.699108123779297, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 136.734375, "completions/mean_terminated_length": 136.734375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3557721596101127, "grad_norm": 1.9415675401687622, "kl": 0.6318359375, "learning_rate": 1.4195121951219512e-05, "loss": 0.0102, "num_tokens": 5129804.0, "reward": -1.12744140625, "reward_std": 0.3162895739078522, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.1455078125, "rewards/ppl_reward/std": 2.6862494945526123, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 162.703125, "completions/mean_terminated_length": 162.703125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.35699055741699665, "grad_norm": 1.6631964445114136, "kl": 0.5771484375, "learning_rate": 1.4243902439024391e-05, "loss": -0.0291, "num_tokens": 5147297.0, "reward": -2.46484375, "reward_std": 0.5331830382347107, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.8125, "rewards/ppl_reward/std": 6.718303203582764, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 151.09375, "completions/mean_terminated_length": 151.09375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.3582089552238806, "grad_norm": 1.9875620603561401, "kl": 0.630859375, "learning_rate": 1.4292682926829269e-05, "loss": -0.0165, "num_tokens": 5164151.0, "reward": -1.0997314453125, "reward_std": 0.301941454410553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.199462890625, "rewards/ppl_reward/std": 4.459592819213867, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 176.90625, "completions/mean_terminated_length": 176.90625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.35942735303076456, "grad_norm": 1.8461755514144897, "kl": 0.5888671875, "learning_rate": 1.4341463414634148e-05, "loss": 0.0643, "num_tokens": 5183217.0, "reward": -0.9315185546875, "reward_std": 0.31258076429367065, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.730224609375, "rewards/ppl_reward/std": 2.212094783782959, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 164.296875, "completions/mean_terminated_length": 164.296875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.3606457508376485, "grad_norm": 1.8024845123291016, "kl": 0.5947265625, "learning_rate": 1.4390243902439025e-05, "loss": -0.0445, "num_tokens": 5200788.0, "reward": -1.52001953125, "reward_std": 0.23936578631401062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.0400390625, "rewards/ppl_reward/std": 2.126957416534424, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 178.109375, "completions/mean_terminated_length": 178.109375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3618641486445324, "grad_norm": 1.792204737663269, "kl": 0.6083984375, "learning_rate": 1.4439024390243904e-05, "loss": -0.0034, "num_tokens": 5219883.0, "reward": -0.1949462890625, "reward_std": 0.14860419929027557, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.358642578125, "rewards/ppl_reward/std": 1.6626403331756592, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 187.046875, "completions/mean_terminated_length": 187.046875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.3630825464514164, "grad_norm": 1.5939313173294067, "kl": 0.6015625, "learning_rate": 1.4487804878048781e-05, "loss": -0.0282, "num_tokens": 5238230.0, "reward": -1.6317138671875, "reward_std": 0.35433685779571533, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.232177734375, "rewards/ppl_reward/std": 6.044897556304932, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.36430094425830034, "grad_norm": 1.6705119609832764, "kl": 0.5986328125, "learning_rate": 1.4536585365853658e-05, "loss": -0.0039, "num_tokens": 5256846.0, "reward": -1.6051025390625, "reward_std": 0.37390297651290894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.171142578125, "rewards/ppl_reward/std": 4.692623138427734, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 224.8125, "completions/mean_terminated_length": 224.8125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.36551934206518427, "grad_norm": 1.8631784915924072, "kl": 0.6201171875, "learning_rate": 1.4585365853658537e-05, "loss": 0.0608, "num_tokens": 5278010.0, "reward": -3.01171875, "reward_std": 0.5828479528427124, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.9375, "rewards/ppl_reward/std": 7.64474630355835, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 217.96875, "completions/mean_terminated_length": 217.96875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.36673773987206826, "grad_norm": 1.655655860900879, "kl": 0.6142578125, "learning_rate": 1.4634146341463415e-05, "loss": 0.0578, "num_tokens": 5298880.0, "reward": -1.2747802734375, "reward_std": 0.6052305102348328, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.510498046875, "rewards/ppl_reward/std": 4.040894508361816, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 240.828125, "completions/mean_terminated_length": 240.828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3679561376789522, "grad_norm": 1.6004148721694946, "kl": 0.5966796875, "learning_rate": 1.4682926829268294e-05, "loss": 0.08, "num_tokens": 5320981.0, "reward": -0.9781494140625, "reward_std": 0.3505082130432129, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.800048828125, "rewards/ppl_reward/std": 2.143390655517578, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 228.640625, "completions/mean_terminated_length": 228.640625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3691745354858361, "grad_norm": 1.6339210271835327, "kl": 0.603515625, "learning_rate": 1.4731707317073171e-05, "loss": 0.0421, "num_tokens": 5342142.0, "reward": -3.3309326171875, "reward_std": 0.4798421561717987, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.630615234375, "rewards/ppl_reward/std": 12.933063507080078, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 222.015625, "completions/mean_terminated_length": 222.015625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.37039293329272005, "grad_norm": 1.7217494249343872, "kl": 0.595703125, "learning_rate": 1.478048780487805e-05, "loss": 0.0888, "num_tokens": 5362839.0, "reward": -0.60137939453125, "reward_std": 0.30693572759628296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.1949462890625, "rewards/ppl_reward/std": 2.941478967666626, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 213.328125, "completions/mean_terminated_length": 213.328125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.37161133109960404, "grad_norm": 1.7616515159606934, "kl": 0.6474609375, "learning_rate": 1.4829268292682927e-05, "loss": -0.0191, "num_tokens": 5382932.0, "reward": -0.44207763671875, "reward_std": 0.24685019254684448, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.7825927734375, "rewards/ppl_reward/std": 2.0946297645568848, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 213.9375, "completions/mean_terminated_length": 201.07937622070312, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.37282972890648797, "grad_norm": 1.9673765897750854, "kl": 0.6708984375, "learning_rate": 1.4878048780487806e-05, "loss": 0.0801, "num_tokens": 5403440.0, "reward": -3.0689697265625, "reward_std": 0.48081111907958984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -10.020751953125, "rewards/ppl_reward/std": 6.961225509643555, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 202.4375, "completions/mean_terminated_length": 202.4375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3740481267133719, "grad_norm": 1.6639165878295898, "kl": 0.689453125, "learning_rate": 1.4926829268292684e-05, "loss": 0.0119, "num_tokens": 5423020.0, "reward": -1.568115234375, "reward_std": 0.49066948890686035, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.97216796875, "rewards/ppl_reward/std": 7.440083026885986, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 240.90625, "completions/mean_terminated_length": 240.90625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.3752665245202559, "grad_norm": 1.4520169496536255, "kl": 0.6201171875, "learning_rate": 1.4975609756097563e-05, "loss": 0.0258, "num_tokens": 5446246.0, "reward": -0.864501953125, "reward_std": 0.2189355194568634, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.65869140625, "rewards/ppl_reward/std": 3.217005491256714, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 238.453125, "completions/mean_terminated_length": 238.453125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3764849223271398, "grad_norm": 1.6151174306869507, "kl": 0.634765625, "learning_rate": 1.502439024390244e-05, "loss": 0.0779, "num_tokens": 5468515.0, "reward": -1.326416015625, "reward_std": 0.2865449786186218, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.52783203125, "rewards/ppl_reward/std": 2.1461827754974365, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 230.4761962890625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.37770332013402375, "grad_norm": 1.7111561298370361, "kl": 0.650390625, "learning_rate": 1.5073170731707319e-05, "loss": 0.1579, "num_tokens": 5491507.0, "reward": -2.2423095703125, "reward_std": 0.8332671523094177, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.195556640625, "rewards/ppl_reward/std": 4.596916675567627, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.37892171794090773, "grad_norm": 1.5880861282348633, "kl": 0.7060546875, "learning_rate": 1.5121951219512196e-05, "loss": 0.0065, "num_tokens": 5510947.0, "reward": -1.7218017578125, "reward_std": 0.5228762626647949, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.310791015625, "rewards/ppl_reward/std": 4.3987812995910645, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 189.109375, "completions/mean_terminated_length": 189.109375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.38014011574779166, "grad_norm": 1.8501845598220825, "kl": 0.6982421875, "learning_rate": 1.5170731707317075e-05, "loss": 0.0972, "num_tokens": 5529682.0, "reward": -1.12103271484375, "reward_std": 0.5255616903305054, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -5.3201904296875, "rewards/ppl_reward/std": 2.5508859157562256, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3813585135546756, "grad_norm": 1.9946084022521973, "kl": 0.734375, "learning_rate": 1.5219512195121952e-05, "loss": 0.0977, "num_tokens": 5547890.0, "reward": -2.599609375, "reward_std": 0.7280975580215454, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -8.48828125, "rewards/ppl_reward/std": 6.803762912750244, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.13858474791049957, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 167.890625, "completions/mean_terminated_length": 167.890625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3825769113615595, "grad_norm": 1.6974294185638428, "kl": 0.7353515625, "learning_rate": 1.526829268292683e-05, "loss": -0.0025, "num_tokens": 5565707.0, "reward": -1.699951171875, "reward_std": 0.5118356347084045, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.04833984375, "rewards/ppl_reward/std": 1.422268033027649, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.0875956118106842, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.3837953091684435, "grad_norm": 1.8715718984603882, "kl": 0.673828125, "learning_rate": 1.531707317073171e-05, "loss": 0.0174, "num_tokens": 5584595.0, "reward": -1.12646484375, "reward_std": 0.3726627826690674, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.1279296875, "rewards/ppl_reward/std": 3.451587677001953, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 149.3125, "completions/mean_terminated_length": 149.3125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.38501370697532744, "grad_norm": 1.922775387763977, "kl": 0.74609375, "learning_rate": 1.5365853658536586e-05, "loss": -0.0242, "num_tokens": 5600903.0, "reward": -0.8028564453125, "reward_std": 0.31182223558425903, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.433837890625, "rewards/ppl_reward/std": 2.5992624759674072, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 142.03125, "completions/mean_terminated_length": 142.03125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.38623210478221137, "grad_norm": 2.06850266456604, "kl": 0.744140625, "learning_rate": 1.5414634146341465e-05, "loss": 0.0375, "num_tokens": 5617217.0, "reward": -0.7144775390625, "reward_std": 0.323103129863739, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.264892578125, "rewards/ppl_reward/std": 2.0959577560424805, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.38745050258909536, "grad_norm": 1.9634805917739868, "kl": 0.716796875, "learning_rate": 1.5463414634146344e-05, "loss": -0.032, "num_tokens": 5633929.0, "reward": -0.3243408203125, "reward_std": 0.23659369349479675, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.539306640625, "rewards/ppl_reward/std": 1.931020736694336, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 136.3125, "completions/mean_terminated_length": 136.3125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3886689003959793, "grad_norm": 2.093852996826172, "kl": 0.7763671875, "learning_rate": 1.551219512195122e-05, "loss": -0.0567, "num_tokens": 5649565.0, "reward": -4.8524169921875, "reward_std": 0.8114759922027588, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -13.587646484375, "rewards/ppl_reward/std": 10.138453483581543, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 169.671875, "completions/mean_terminated_length": 169.671875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.3898872982028632, "grad_norm": 1.6714363098144531, "kl": 0.7001953125, "learning_rate": 1.55609756097561e-05, "loss": -0.0444, "num_tokens": 5668200.0, "reward": -0.10107421875, "reward_std": 0.31322646141052246, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.0537109375, "rewards/ppl_reward/std": 1.3625566959381104, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 157.390625, "completions/mean_terminated_length": 157.390625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.3911056960097472, "grad_norm": 1.8232965469360352, "kl": 0.71875, "learning_rate": 1.5609756097560978e-05, "loss": 0.0147, "num_tokens": 5685321.0, "reward": -1.06396484375, "reward_std": 0.3594765067100525, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.0732421875, "rewards/ppl_reward/std": 2.334777593612671, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 202.546875, "completions/mean_terminated_length": 202.546875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.39232409381663114, "grad_norm": 1.4959042072296143, "kl": 0.65234375, "learning_rate": 1.5658536585365857e-05, "loss": -0.0215, "num_tokens": 5705572.0, "reward": -2.718505859375, "reward_std": 0.604778528213501, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.22607421875, "rewards/ppl_reward/std": 9.79585075378418, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07864411175251007, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 216.609375, "completions/mean_terminated_length": 216.609375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.39354249162351507, "grad_norm": 1.3328492641448975, "kl": 0.58984375, "learning_rate": 1.5707317073170732e-05, "loss": 0.0562, "num_tokens": 5726139.0, "reward": -2.74951171875, "reward_std": 0.3497428297996521, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.4287109375, "rewards/ppl_reward/std": 5.3133745193481445, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 260.59375, "completions/mean_terminated_length": 260.59375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.394760889430399, "grad_norm": 1.3641786575317383, "kl": 0.52294921875, "learning_rate": 1.575609756097561e-05, "loss": 0.0219, "num_tokens": 5750393.0, "reward": -2.3101806640625, "reward_std": 0.2977946698665619, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.510986328125, "rewards/ppl_reward/std": 3.677238941192627, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.395979287237283, "grad_norm": 1.3354986906051636, "kl": 0.51904296875, "learning_rate": 1.580487804878049e-05, "loss": -0.0462, "num_tokens": 5774135.0, "reward": -1.19842529296875, "reward_std": 0.35152310132980347, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.3577880859375, "rewards/ppl_reward/std": 5.574345111846924, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 364.515625, "completions/mean_terminated_length": 354.0476379394531, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3971976850441669, "grad_norm": 1.226672887802124, "kl": 0.48291015625, "learning_rate": 1.585365853658537e-05, "loss": 0.0385, "num_tokens": 5804960.0, "reward": -1.2366943359375, "reward_std": 0.38889461755752563, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.332763671875, "rewards/ppl_reward/std": 1.6652121543884277, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 431.546875, "completions/mean_terminated_length": 422.14288330078125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.39841608285105085, "grad_norm": 1.3611916303634644, "kl": 0.52734375, "learning_rate": 1.5902439024390245e-05, "loss": -0.0183, "num_tokens": 5839939.0, "reward": -1.0042724609375, "reward_std": 0.3898901343345642, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.922607421875, "rewards/ppl_reward/std": 2.0954437255859375, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 367.453125, "completions/mean_terminated_length": 367.453125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.39963448065793483, "grad_norm": 1.0392358303070068, "kl": 0.474609375, "learning_rate": 1.5951219512195124e-05, "loss": 0.0234, "num_tokens": 5870416.0, "reward": -0.05426025390625, "reward_std": 0.13598230481147766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.1085205078125, "rewards/ppl_reward/std": 1.6874754428863525, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 316.359375, "completions/mean_terminated_length": 316.359375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.40085287846481876, "grad_norm": 1.092799186706543, "kl": 0.52880859375, "learning_rate": 1.6000000000000003e-05, "loss": -0.0691, "num_tokens": 5897095.0, "reward": -0.138427734375, "reward_std": 0.18317291140556335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.27685546875, "rewards/ppl_reward/std": 1.3972082138061523, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 296.953125, "completions/mean_terminated_length": 285.4127197265625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.4020712762717027, "grad_norm": 1.2575397491455078, "kl": 0.509765625, "learning_rate": 1.604878048780488e-05, "loss": 0.0068, "num_tokens": 5923204.0, "reward": -1.1265869140625, "reward_std": 0.24523073434829712, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.143798828125, "rewards/ppl_reward/std": 3.4950785636901855, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 333.359375, "completions/mean_terminated_length": 322.3968505859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4032896740785867, "grad_norm": 1.2014464139938354, "kl": 0.49560546875, "learning_rate": 1.6097560975609757e-05, "loss": 0.0943, "num_tokens": 5951739.0, "reward": -0.5950927734375, "reward_std": 0.22592678666114807, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.151123046875, "rewards/ppl_reward/std": 1.2727562189102173, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 269.234375, "completions/mean_terminated_length": 269.234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4045080718854706, "grad_norm": 1.3571531772613525, "kl": 0.5908203125, "learning_rate": 1.6146341463414636e-05, "loss": 0.0011, "num_tokens": 5975074.0, "reward": -0.887451171875, "reward_std": 0.4408779740333557, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.73583984375, "rewards/ppl_reward/std": 3.5293562412261963, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.40572646969235454, "grad_norm": 1.251836895942688, "kl": 0.5830078125, "learning_rate": 1.6195121951219515e-05, "loss": 0.0685, "num_tokens": 5999080.0, "reward": -1.70458984375, "reward_std": 0.42401838302612305, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.4091796875, "rewards/ppl_reward/std": 5.76924467086792, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 247.9375, "completions/mean_terminated_length": 247.9375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.4069448674992385, "grad_norm": 1.3308767080307007, "kl": 0.572265625, "learning_rate": 1.6243902439024394e-05, "loss": -0.0118, "num_tokens": 6022372.0, "reward": -0.605224609375, "reward_std": 0.1863909214735031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.21044921875, "rewards/ppl_reward/std": 2.692749261856079, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 229.640625, "completions/mean_terminated_length": 229.640625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.40816326530612246, "grad_norm": 1.2486354112625122, "kl": 0.580078125, "learning_rate": 1.629268292682927e-05, "loss": 0.0042, "num_tokens": 6044845.0, "reward": -2.0552978515625, "reward_std": 0.28639692068099976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.110595703125, "rewards/ppl_reward/std": 6.012795448303223, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 224.34375, "completions/mean_terminated_length": 224.34375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.4093816631130064, "grad_norm": 1.2533490657806396, "kl": 0.541015625, "learning_rate": 1.6341463414634145e-05, "loss": 0.0351, "num_tokens": 6066315.0, "reward": -0.65533447265625, "reward_std": 0.29511070251464844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.3028564453125, "rewards/ppl_reward/std": 2.0028328895568848, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4106000609198903, "grad_norm": 1.293014407157898, "kl": 0.6201171875, "learning_rate": 1.6390243902439024e-05, "loss": -0.0874, "num_tokens": 6085931.0, "reward": -7.01708984375, "reward_std": 2.7051069736480713, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -17.8857421875, "rewards/ppl_reward/std": 33.671810150146484, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.4118184587267743, "grad_norm": 1.4397590160369873, "kl": 0.58984375, "learning_rate": 1.6439024390243903e-05, "loss": -0.0175, "num_tokens": 6104883.0, "reward": -1.1025390625, "reward_std": 0.30214542150497437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.205078125, "rewards/ppl_reward/std": 2.885042667388916, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 209.171875, "completions/mean_terminated_length": 209.171875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.41303685653365824, "grad_norm": 1.3514671325683594, "kl": 0.6044921875, "learning_rate": 1.6487804878048782e-05, "loss": 0.0344, "num_tokens": 6124758.0, "reward": -0.7880859375, "reward_std": 0.24253147840499878, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.576171875, "rewards/ppl_reward/std": 3.044945001602173, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 210.71875, "completions/mean_terminated_length": 210.71875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.41425525434054217, "grad_norm": 1.2715760469436646, "kl": 0.54345703125, "learning_rate": 1.6536585365853658e-05, "loss": -0.0119, "num_tokens": 6145324.0, "reward": -2.1505126953125, "reward_std": 0.40435442328453064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.301025390625, "rewards/ppl_reward/std": 10.98659896850586, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.41547365214742615, "grad_norm": 1.386276125907898, "kl": 0.626953125, "learning_rate": 1.6585365853658537e-05, "loss": 0.017, "num_tokens": 6165788.0, "reward": -0.9381103515625, "reward_std": 0.44857257604599, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.837158203125, "rewards/ppl_reward/std": 3.7989814281463623, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4166920499543101, "grad_norm": 1.2564367055892944, "kl": 0.5615234375, "learning_rate": 1.6634146341463416e-05, "loss": 0.0226, "num_tokens": 6187396.0, "reward": -1.962646484375, "reward_std": 0.31680619716644287, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.88623046875, "rewards/ppl_reward/std": 5.261635780334473, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.417910447761194, "grad_norm": 1.5751864910125732, "kl": 0.62890625, "learning_rate": 1.6682926829268295e-05, "loss": 0.0321, "num_tokens": 6207460.0, "reward": -0.9344482421875, "reward_std": 0.38522857427597046, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.782958984375, "rewards/ppl_reward/std": 2.8562231063842773, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 230.78125, "completions/mean_terminated_length": 230.78125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.419128845568078, "grad_norm": 1.227177619934082, "kl": 0.5791015625, "learning_rate": 1.673170731707317e-05, "loss": -0.0036, "num_tokens": 6229590.0, "reward": -1.7913818359375, "reward_std": 0.47378695011138916, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.535888671875, "rewards/ppl_reward/std": 5.899801731109619, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 192.921875, "completions/mean_terminated_length": 192.921875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.42034724337496193, "grad_norm": 1.460483193397522, "kl": 0.6142578125, "learning_rate": 1.678048780487805e-05, "loss": -0.0423, "num_tokens": 6248857.0, "reward": -1.9259033203125, "reward_std": 0.41743040084838867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.851806640625, "rewards/ppl_reward/std": 7.055027961730957, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 222.4375, "completions/mean_terminated_length": 209.71429443359375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.42156564118184586, "grad_norm": 1.4052563905715942, "kl": 0.638671875, "learning_rate": 1.682926829268293e-05, "loss": -0.0033, "num_tokens": 6269901.0, "reward": -1.065185546875, "reward_std": 0.3438885509967804, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.05224609375, "rewards/ppl_reward/std": 3.0571939945220947, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 203.890625, "completions/mean_terminated_length": 203.890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.4227840389887298, "grad_norm": 1.4336719512939453, "kl": 0.6162109375, "learning_rate": 1.6878048780487804e-05, "loss": 0.0355, "num_tokens": 6289462.0, "reward": -1.49169921875, "reward_std": 0.5243953466415405, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.8974609375, "rewards/ppl_reward/std": 5.098904132843018, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4240024367956138, "grad_norm": 1.3638347387313843, "kl": 0.630859375, "learning_rate": 1.6926829268292683e-05, "loss": 0.0253, "num_tokens": 6309590.0, "reward": -1.763916015625, "reward_std": 0.5017063021659851, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.52783203125, "rewards/ppl_reward/std": 5.7263288497924805, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 219.484375, "completions/mean_terminated_length": 219.484375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4252208346024977, "grad_norm": 1.427025318145752, "kl": 0.6142578125, "learning_rate": 1.6975609756097562e-05, "loss": -0.0572, "num_tokens": 6330533.0, "reward": -0.718994140625, "reward_std": 0.2711050510406494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.43017578125, "rewards/ppl_reward/std": 2.631356716156006, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 252.59375, "completions/mean_terminated_length": 252.59375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.42643923240938164, "grad_norm": 1.2365049123764038, "kl": 0.54931640625, "learning_rate": 1.702439024390244e-05, "loss": -0.0815, "num_tokens": 6353467.0, "reward": -2.2249755859375, "reward_std": 0.7513050436973572, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.449951171875, "rewards/ppl_reward/std": 8.375478744506836, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 259.359375, "completions/mean_terminated_length": 234.69354248046875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4276576302162656, "grad_norm": 1.1452304124832153, "kl": 0.49462890625, "learning_rate": 1.7073170731707317e-05, "loss": -0.1692, "num_tokens": 6376922.0, "reward": -2.9666748046875, "reward_std": 0.5319024324417114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.933349609375, "rewards/ppl_reward/std": 6.0822954177856445, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 266.6229248046875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.42887602802314956, "grad_norm": 1.3898377418518066, "kl": 0.4931640625, "learning_rate": 1.7121951219512196e-05, "loss": 0.2331, "num_tokens": 6402938.0, "reward": -0.419677734375, "reward_std": 0.914198637008667, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.62841796875, "rewards/ppl_reward/std": 2.5486319065093994, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 382.765625, "completions/mean_terminated_length": 362.08062744140625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.4300944258300335, "grad_norm": 1.1687575578689575, "kl": 0.41845703125, "learning_rate": 1.7170731707317075e-05, "loss": 0.1958, "num_tokens": 6434923.0, "reward": -2.0167236328125, "reward_std": 0.540976345539093, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.963134765625, "rewards/ppl_reward/std": 4.040082931518555, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 303.78125, "completions/mean_terminated_length": 292.3492126464844, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4313128236369175, "grad_norm": 1.1794167757034302, "kl": 0.4365234375, "learning_rate": 1.7219512195121954e-05, "loss": 0.0559, "num_tokens": 6461421.0, "reward": -0.49169921875, "reward_std": 0.26778507232666016, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.9755859375, "rewards/ppl_reward/std": 1.2821143865585327, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 256.015625, "completions/mean_terminated_length": 231.24192810058594, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4325312214438014, "grad_norm": 1.330701470375061, "kl": 0.4501953125, "learning_rate": 1.726829268292683e-05, "loss": -0.0376, "num_tokens": 6484454.0, "reward": -2.671142578125, "reward_std": 0.6437525749206543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.34228515625, "rewards/ppl_reward/std": 4.999107837677002, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 259.671875, "completions/mean_terminated_length": 259.671875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.43374961925068534, "grad_norm": 1.3390755653381348, "kl": 0.470703125, "learning_rate": 1.7317073170731708e-05, "loss": 0.1169, "num_tokens": 6508385.0, "reward": -0.464111328125, "reward_std": 0.3498694598674774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.92041015625, "rewards/ppl_reward/std": 2.8004860877990723, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 245.515625, "completions/mean_terminated_length": 220.40321350097656, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4349680170575693, "grad_norm": 1.476174235343933, "kl": 0.5439453125, "learning_rate": 1.7365853658536587e-05, "loss": -0.08, "num_tokens": 6530938.0, "reward": -1.58740234375, "reward_std": 0.7027636170387268, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.1748046875, "rewards/ppl_reward/std": 5.4212422370910645, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 266.234375, "completions/mean_terminated_length": 228.96719360351562, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.43618641486445325, "grad_norm": 1.181464672088623, "kl": 0.46435546875, "learning_rate": 1.7414634146341466e-05, "loss": 0.0111, "num_tokens": 6555073.0, "reward": -0.80322265625, "reward_std": 0.27028048038482666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.5986328125, "rewards/ppl_reward/std": 3.2608656883239746, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 226.03125, "completions/mean_terminated_length": 200.29031372070312, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4374048126713372, "grad_norm": 1.3808578252792358, "kl": 0.466796875, "learning_rate": 1.7463414634146342e-05, "loss": 0.057, "num_tokens": 6576851.0, "reward": -0.5850830078125, "reward_std": 0.28356456756591797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.170166015625, "rewards/ppl_reward/std": 1.6626209020614624, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 246.328125, "completions/mean_terminated_length": 208.0819549560547, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4386232104782211, "grad_norm": 1.470029354095459, "kl": 0.42724609375, "learning_rate": 1.751219512195122e-05, "loss": 0.1197, "num_tokens": 6599408.0, "reward": -7.4500732421875, "reward_std": 2.9720356464385986, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -18.798583984375, "rewards/ppl_reward/std": 39.121150970458984, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 250.921875, "completions/mean_terminated_length": 212.90162658691406, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.4398416082851051, "grad_norm": 1.2587963342666626, "kl": 0.4375, "learning_rate": 1.75609756097561e-05, "loss": -0.0596, "num_tokens": 6621963.0, "reward": -1.0557861328125, "reward_std": 0.33992475271224976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.111572265625, "rewards/ppl_reward/std": 3.2889575958251953, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 207.953125, "completions/mean_terminated_length": 181.6290283203125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.44106000609198903, "grad_norm": 1.5729153156280518, "kl": 0.4794921875, "learning_rate": 1.760975609756098e-05, "loss": 0.0807, "num_tokens": 6642704.0, "reward": -1.3529052734375, "reward_std": 0.29466092586517334, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.674560546875, "rewards/ppl_reward/std": 1.8768479824066162, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 197.359375, "completions/mean_terminated_length": 184.2381134033203, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.44227840389887296, "grad_norm": 1.5458526611328125, "kl": 0.49267578125, "learning_rate": 1.7658536585365854e-05, "loss": 0.0476, "num_tokens": 6662215.0, "reward": -1.4219970703125, "reward_std": 0.39813053607940674, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.804931640625, "rewards/ppl_reward/std": 4.152256011962891, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 185.21875, "completions/mean_terminated_length": 171.90476989746094, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.44349680170575695, "grad_norm": 1.5948647260665894, "kl": 0.4775390625, "learning_rate": 1.7707317073170733e-05, "loss": 0.0962, "num_tokens": 6680725.0, "reward": -0.4041748046875, "reward_std": 0.3865489363670349, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.659912109375, "rewards/ppl_reward/std": 2.63649845123291, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 144.40625, "completions/mean_terminated_length": 130.4444580078125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.4447151995126409, "grad_norm": 1.759537935256958, "kl": 0.53759765625, "learning_rate": 1.7756097560975612e-05, "loss": -0.0155, "num_tokens": 6696303.0, "reward": -2.3221435546875, "reward_std": 0.48489871621131897, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.613037109375, "rewards/ppl_reward/std": 3.004516124725342, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4459335973195248, "grad_norm": 2.19347882270813, "kl": 0.62109375, "learning_rate": 1.7804878048780488e-05, "loss": -0.0118, "num_tokens": 6711375.0, "reward": -0.7880859375, "reward_std": 0.2516166567802429, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.576171875, "rewards/ppl_reward/std": 2.2559866905212402, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 140.0625, "completions/mean_terminated_length": 140.0625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4471519951264088, "grad_norm": 1.948355793952942, "kl": 0.552734375, "learning_rate": 1.7853658536585367e-05, "loss": 0.0633, "num_tokens": 6727075.0, "reward": -1.24755859375, "reward_std": 0.304685115814209, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.4248046875, "rewards/ppl_reward/std": 3.963653564453125, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4483703929332927, "grad_norm": 1.8451787233352661, "kl": 0.50537109375, "learning_rate": 1.7902439024390246e-05, "loss": 0.0153, "num_tokens": 6743107.0, "reward": -4.4244384765625, "reward_std": 1.0996887683868408, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -12.786376953125, "rewards/ppl_reward/std": 20.840728759765625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.44958879074017666, "grad_norm": 1.7788459062576294, "kl": 0.5322265625, "learning_rate": 1.7951219512195125e-05, "loss": 0.0454, "num_tokens": 6759451.0, "reward": -0.870849609375, "reward_std": 0.3771709203720093, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.70263671875, "rewards/ppl_reward/std": 2.900693655014038, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 150.796875, "completions/mean_terminated_length": 150.796875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4508071885470606, "grad_norm": 1.7875982522964478, "kl": 0.525390625, "learning_rate": 1.8e-05, "loss": 0.0433, "num_tokens": 6776270.0, "reward": -1.3133544921875, "reward_std": 0.3880191147327423, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.470458984375, "rewards/ppl_reward/std": 3.0636045932769775, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 150.15625, "completions/mean_terminated_length": 150.15625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4520255863539446, "grad_norm": 1.9870625734329224, "kl": 0.556640625, "learning_rate": 1.804878048780488e-05, "loss": 0.0024, "num_tokens": 6792664.0, "reward": -1.0458984375, "reward_std": 0.2860358953475952, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.021484375, "rewards/ppl_reward/std": 2.470820426940918, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 143.6875, "completions/mean_terminated_length": 143.6875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4532439841608285, "grad_norm": 1.6927814483642578, "kl": 0.541015625, "learning_rate": 1.809756097560976e-05, "loss": 0.0486, "num_tokens": 6809092.0, "reward": -2.4254150390625, "reward_std": 0.39007264375686646, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.811767578125, "rewards/ppl_reward/std": 8.938628196716309, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 132.890625, "completions/mean_terminated_length": 132.890625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.45446238196771244, "grad_norm": 1.870866060256958, "kl": 0.5732421875, "learning_rate": 1.8146341463414637e-05, "loss": 0.0417, "num_tokens": 6824285.0, "reward": -4.49432373046875, "reward_std": 1.221816897392273, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -12.6995849609375, "rewards/ppl_reward/std": 15.220105171203613, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 128.234375, "completions/mean_terminated_length": 128.234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4556807797745964, "grad_norm": 2.3595104217529297, "kl": 0.603515625, "learning_rate": 1.8195121951219513e-05, "loss": 0.0191, "num_tokens": 6838868.0, "reward": -2.33984375, "reward_std": 1.4411020278930664, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.3046875, "rewards/ppl_reward/std": 4.887476444244385, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2514837086200714, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.45689917758148035, "grad_norm": 1.869585633277893, "kl": 0.5986328125, "learning_rate": 1.8243902439024392e-05, "loss": 0.0313, "num_tokens": 6853332.0, "reward": -3.28778076171875, "reward_std": 0.6961699724197388, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -10.4661865234375, "rewards/ppl_reward/std": 8.58242416381836, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4581175753883643, "grad_norm": 1.7740814685821533, "kl": 0.5517578125, "learning_rate": 1.829268292682927e-05, "loss": 0.0148, "num_tokens": 6869404.0, "reward": -1.59600830078125, "reward_std": 0.3383733630180359, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.1920166015625, "rewards/ppl_reward/std": 5.784573078155518, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 134.0625, "completions/mean_terminated_length": 134.0625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.45933597319524827, "grad_norm": 1.9370981454849243, "kl": 0.6416015625, "learning_rate": 1.834146341463415e-05, "loss": 0.0178, "num_tokens": 6885544.0, "reward": 0.1373291015625, "reward_std": 0.21941658854484558, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -3.631591796875, "rewards/ppl_reward/std": 0.9235897660255432, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.4605543710021322, "grad_norm": 1.7047014236450195, "kl": 0.6181640625, "learning_rate": 1.8390243902439026e-05, "loss": -0.0564, "num_tokens": 6900044.0, "reward": -2.3145751953125, "reward_std": 0.5756134390830994, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.582275390625, "rewards/ppl_reward/std": 4.704937934875488, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.46177276880901613, "grad_norm": 1.7707440853118896, "kl": 0.7548828125, "learning_rate": 1.8439024390243905e-05, "loss": -0.0177, "num_tokens": 6914484.0, "reward": -3.3052978515625, "reward_std": 0.7109465599060059, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -10.493408203125, "rewards/ppl_reward/std": 8.246628761291504, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4629911666159001, "grad_norm": 1.6636971235275269, "kl": 0.6513671875, "learning_rate": 1.8487804878048784e-05, "loss": 0.0134, "num_tokens": 6931016.0, "reward": -2.1763916015625, "reward_std": 0.37464606761932373, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.173095703125, "rewards/ppl_reward/std": 7.974277496337891, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 177.9375, "completions/mean_terminated_length": 177.9375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.46420956442278405, "grad_norm": 1.458424687385559, "kl": 0.6455078125, "learning_rate": 1.8536585365853663e-05, "loss": 0.0273, "num_tokens": 6949292.0, "reward": -0.938232421875, "reward_std": 0.3127153515815735, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.63427734375, "rewards/ppl_reward/std": 3.1145968437194824, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 215.8125, "completions/mean_terminated_length": 215.8125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.465427962229668, "grad_norm": 1.3414945602416992, "kl": 0.615234375, "learning_rate": 1.8585365853658538e-05, "loss": 0.0199, "num_tokens": 6970664.0, "reward": -1.4642333984375, "reward_std": 0.529543399810791, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.717529296875, "rewards/ppl_reward/std": 8.337900161743164, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 200.40321350097656, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.4666463600365519, "grad_norm": 1.4398276805877686, "kl": 0.60107421875, "learning_rate": 1.8634146341463417e-05, "loss": 0.0998, "num_tokens": 6991769.0, "reward": -3.6300048828125, "reward_std": 0.6120027303695679, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -11.064697265625, "rewards/ppl_reward/std": 8.456076622009277, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 202.859375, "completions/mean_terminated_length": 202.859375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4678647578434359, "grad_norm": 1.4912998676300049, "kl": 0.681640625, "learning_rate": 1.8682926829268296e-05, "loss": 0.0407, "num_tokens": 7012040.0, "reward": -0.6697998046875, "reward_std": 0.2086089849472046, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.214599609375, "rewards/ppl_reward/std": 1.9197380542755127, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 204.59375, "completions/mean_terminated_length": 204.59375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4690831556503198, "grad_norm": 1.6355870962142944, "kl": 0.7001953125, "learning_rate": 1.8731707317073172e-05, "loss": 0.0631, "num_tokens": 7031382.0, "reward": -2.36480712890625, "reward_std": 1.6870973110198975, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.4093017578125, "rewards/ppl_reward/std": 9.869730949401855, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 164.859375, "completions/mean_terminated_length": 164.859375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.47030155345720376, "grad_norm": 1.7266676425933838, "kl": 0.8037109375, "learning_rate": 1.878048780487805e-05, "loss": 0.0378, "num_tokens": 7048349.0, "reward": -1.6536865234375, "reward_std": 1.0368950366973877, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.158935546875, "rewards/ppl_reward/std": 5.461587429046631, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 176.203125, "completions/mean_terminated_length": 176.203125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.47151995126408774, "grad_norm": 1.6105479001998901, "kl": 0.75, "learning_rate": 1.8829268292682926e-05, "loss": 0.023, "num_tokens": 7067114.0, "reward": -1.7099609375, "reward_std": 0.23140332102775574, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.380859375, "rewards/ppl_reward/std": 5.47029447555542, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 151.484375, "completions/mean_terminated_length": 151.484375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4727383490709717, "grad_norm": 1.9192259311676025, "kl": 0.8388671875, "learning_rate": 1.8878048780487805e-05, "loss": 0.0347, "num_tokens": 7083417.0, "reward": -1.21875, "reward_std": 0.38148826360702515, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.28125, "rewards/ppl_reward/std": 4.660264015197754, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 138.328125, "completions/mean_terminated_length": 138.328125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4739567468778556, "grad_norm": 1.8341161012649536, "kl": 0.8251953125, "learning_rate": 1.8926829268292684e-05, "loss": -0.027, "num_tokens": 7098902.0, "reward": -1.1326904296875, "reward_std": 0.22551517188549042, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.195068359375, "rewards/ppl_reward/std": 2.777834415435791, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 147.3125, "completions/mean_terminated_length": 147.3125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.4751751446847396, "grad_norm": 1.9308363199234009, "kl": 0.826171875, "learning_rate": 1.8975609756097563e-05, "loss": 0.0319, "num_tokens": 7115482.0, "reward": -1.6488037109375, "reward_std": 0.3707605004310608, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.071044921875, "rewards/ppl_reward/std": 4.3304948806762695, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16810208559036255, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 150.796875, "completions/mean_terminated_length": 150.796875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4763935424916235, "grad_norm": 1.8689100742340088, "kl": 0.787109375, "learning_rate": 1.902439024390244e-05, "loss": 0.0444, "num_tokens": 7133141.0, "reward": -1.59814453125, "reward_std": 0.19901889562606812, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.1572265625, "rewards/ppl_reward/std": 4.826797962188721, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 143.8125, "completions/mean_terminated_length": 143.8125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.47761194029850745, "grad_norm": 1.9270706176757812, "kl": 0.765625, "learning_rate": 1.9073170731707318e-05, "loss": 0.0398, "num_tokens": 7149785.0, "reward": -3.282470703125, "reward_std": 0.7294915914535522, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -10.52587890625, "rewards/ppl_reward/std": 13.881502151489258, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 165.90625, "completions/mean_terminated_length": 152.2857208251953, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4788303381053914, "grad_norm": 2.0876569747924805, "kl": 0.701171875, "learning_rate": 1.9121951219512197e-05, "loss": 0.1907, "num_tokens": 7168019.0, "reward": -1.0391845703125, "reward_std": 0.7083539366722107, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.023681640625, "rewards/ppl_reward/std": 3.1655895709991455, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 139.6875, "completions/mean_terminated_length": 139.6875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.48004873591227537, "grad_norm": 1.9496978521347046, "kl": 0.7744140625, "learning_rate": 1.9170731707317072e-05, "loss": 0.0259, "num_tokens": 7183607.0, "reward": -1.0179443359375, "reward_std": 0.2107255756855011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.035888671875, "rewards/ppl_reward/std": 2.1673331260681152, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 137.140625, "completions/mean_terminated_length": 137.140625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.4812671337191593, "grad_norm": 2.0072994232177734, "kl": 0.7626953125, "learning_rate": 1.921951219512195e-05, "loss": -0.0108, "num_tokens": 7199112.0, "reward": -5.513427734375, "reward_std": 1.215253472328186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -15.02685546875, "rewards/ppl_reward/std": 17.72804832458496, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 153.421875, "completions/mean_terminated_length": 139.60317993164062, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.48248553152604323, "grad_norm": 1.928562045097351, "kl": 0.728515625, "learning_rate": 1.926829268292683e-05, "loss": -0.0923, "num_tokens": 7215827.0, "reward": -0.44232177734375, "reward_std": 0.19075147807598114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.8846435546875, "rewards/ppl_reward/std": 1.876242756843567, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 157.421875, "completions/mean_terminated_length": 157.421875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.4837039293329272, "grad_norm": 1.861298680305481, "kl": 0.7177734375, "learning_rate": 1.931707317073171e-05, "loss": 0.0733, "num_tokens": 7232694.0, "reward": -1.2127685546875, "reward_std": 0.47112220525741577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.425537109375, "rewards/ppl_reward/std": 3.780320167541504, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 182.9375, "completions/mean_terminated_length": 169.58731079101562, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.48492232713981115, "grad_norm": 1.895073413848877, "kl": 0.642578125, "learning_rate": 1.9365853658536585e-05, "loss": 0.1687, "num_tokens": 7251610.0, "reward": -0.7115478515625, "reward_std": 0.2381569892168045, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.376220703125, "rewards/ppl_reward/std": 2.6068100929260254, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 196.546875, "completions/mean_terminated_length": 169.85482788085938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.4861407249466951, "grad_norm": 1.8868906497955322, "kl": 0.6650390625, "learning_rate": 1.9414634146341464e-05, "loss": 0.2756, "num_tokens": 7271277.0, "reward": -0.882080078125, "reward_std": 0.3038502335548401, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.69384765625, "rewards/ppl_reward/std": 4.268520832061768, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 183.671875, "completions/mean_terminated_length": 183.671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.48735912275357907, "grad_norm": 1.7125482559204102, "kl": 0.6884765625, "learning_rate": 1.9463414634146343e-05, "loss": 0.1058, "num_tokens": 7289752.0, "reward": -1.7225341796875, "reward_std": 0.2894718647003174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.445068359375, "rewards/ppl_reward/std": 2.765293598175049, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 170.546875, "completions/mean_terminated_length": 170.546875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.488577520560463, "grad_norm": 1.7278586626052856, "kl": 0.6923828125, "learning_rate": 1.9512195121951222e-05, "loss": 0.0881, "num_tokens": 7307475.0, "reward": -0.50537109375, "reward_std": 0.15056367218494415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.0107421875, "rewards/ppl_reward/std": 2.0803604125976562, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 160.19049072265625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.4897959183673469, "grad_norm": 2.2412662506103516, "kl": 0.697265625, "learning_rate": 1.9560975609756098e-05, "loss": 0.1753, "num_tokens": 7325631.0, "reward": -1.1390380859375, "reward_std": 0.3519960343837738, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.207763671875, "rewards/ppl_reward/std": 4.602639675140381, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 171.09375, "completions/mean_terminated_length": 157.55555725097656, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4910143161742309, "grad_norm": 2.1364662647247314, "kl": 0.736328125, "learning_rate": 1.9609756097560977e-05, "loss": 0.1965, "num_tokens": 7343605.0, "reward": -0.810302734375, "reward_std": 0.2708771824836731, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.54248046875, "rewards/ppl_reward/std": 5.072822570800781, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 166.4375, "completions/mean_terminated_length": 166.4375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.49223271398111484, "grad_norm": 1.620985746383667, "kl": 0.7587890625, "learning_rate": 1.9658536585365856e-05, "loss": 0.0436, "num_tokens": 7361865.0, "reward": -0.2178955078125, "reward_std": 0.10872030258178711, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.435791015625, "rewards/ppl_reward/std": 1.5361900329589844, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 124.53125, "completions/mean_terminated_length": 124.53125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.4934511117879988, "grad_norm": 1.8983322381973267, "kl": 0.9150390625, "learning_rate": 1.9707317073170734e-05, "loss": 0.0061, "num_tokens": 7376627.0, "reward": -1.1861572265625, "reward_std": 0.3632216155529022, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.247314453125, "rewards/ppl_reward/std": 3.3323843479156494, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 189.265625, "completions/mean_terminated_length": 148.21310424804688, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4946695095948827, "grad_norm": 2.5103440284729004, "kl": 0.8486328125, "learning_rate": 1.975609756097561e-05, "loss": 0.261, "num_tokens": 7395556.0, "reward": -2.1561279296875, "reward_std": 0.6643698811531067, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.077880859375, "rewards/ppl_reward/std": 8.16186237335205, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 182.734375, "completions/mean_terminated_length": 141.3606414794922, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4958879074017667, "grad_norm": 4.0307159423828125, "kl": 0.84375, "learning_rate": 1.980487804878049e-05, "loss": 0.3656, "num_tokens": 7414075.0, "reward": -2.9503173828125, "reward_std": 0.8378671407699585, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.728759765625, "rewards/ppl_reward/std": 4.545670509338379, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 149.31146240234375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4971063052086506, "grad_norm": 3.849708318710327, "kl": 1.0625, "learning_rate": 1.9853658536585368e-05, "loss": 0.27, "num_tokens": 7433183.0, "reward": -0.17889404296875, "reward_std": 0.379257470369339, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.1156005859375, "rewards/ppl_reward/std": 1.2053158283233643, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 188.921875, "completions/mean_terminated_length": 161.98387145996094, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.49832470301553455, "grad_norm": 6.2870330810546875, "kl": 1.2919921875, "learning_rate": 1.9902439024390247e-05, "loss": 0.4704, "num_tokens": 7452330.0, "reward": -1.607177734375, "reward_std": 0.622022807598114, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.87841796875, "rewards/ppl_reward/std": 8.986185073852539, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1302827149629593, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 152.27117919921875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.49954310082241854, "grad_norm": 6.998262882232666, "kl": 2.171875, "learning_rate": 1.9951219512195123e-05, "loss": 0.4337, "num_tokens": 7473442.0, "reward": -1.598876953125, "reward_std": 0.7931692004203796, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.58056640625, "rewards/ppl_reward/std": 3.7176811695098877, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11566052585840225, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 185.453125, "completions/mean_terminated_length": 158.40321350097656, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.5007614986293024, "grad_norm": 3.841958522796631, "kl": 3.7890625, "learning_rate": 2e-05, "loss": 0.3311, "num_tokens": 7492767.0, "reward": -1.223876953125, "reward_std": 0.5479111671447754, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.90869140625, "rewards/ppl_reward/std": 2.715557098388672, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.10137632489204407, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 116.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5019798964361865, "grad_norm": 5.489882946014404, "kl": 4.546875, "learning_rate": 1.9999996375759652e-05, "loss": 0.3689, "num_tokens": 7508639.0, "reward": -3.3743896484375, "reward_std": 1.0598646402359009, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5039526224136353, "rewards/ppl_reward/mean": -9.490966796875, "rewards/ppl_reward/std": 7.37631368637085, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.1666201651096344, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 116.13793182373047, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5031982942430704, "grad_norm": 12.928796768188477, "kl": 13.421875, "learning_rate": 1.999998550304123e-05, "loss": 0.941, "num_tokens": 7528647.0, "reward": -1.4429931640625, "reward_std": 0.7557246088981628, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -5.979736328125, "rewards/ppl_reward/std": 3.376675844192505, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.116794154047966, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 100.63157653808594, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5044166920499543, "grad_norm": 7.797473430633545, "kl": 11.0625, "learning_rate": 1.999996738185262e-05, "loss": 0.8639, "num_tokens": 7548679.0, "reward": -1.6929931640625, "reward_std": 0.6312190890312195, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.768798828125, "rewards/ppl_reward/std": 3.1598827838897705, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11566052585840225, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 186.0625, "completions/mean_terminated_length": 99.37931060791016, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5056350898568382, "grad_norm": 16.802810668945312, "kl": 7.890625, "learning_rate": 1.9999942012206947e-05, "loss": 1.0383, "num_tokens": 7567507.0, "reward": -1.020263671875, "reward_std": 0.5000051259994507, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.55615234375, "rewards/ppl_reward/std": 1.8212486505508423, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 150.90625, "completions/mean_terminated_length": 92.70000457763672, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5068534876637222, "grad_norm": 7.1255669593811035, "kl": 9.53125, "learning_rate": 1.999990939412261e-05, "loss": 0.8532, "num_tokens": 7583573.0, "reward": -2.08837890625, "reward_std": 0.7559540867805481, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.6689453125, "rewards/ppl_reward/std": 8.426098823547363, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 88.8888931274414, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5080718854706061, "grad_norm": 8.495018005371094, "kl": 5.396484375, "learning_rate": 1.9999869527623253e-05, "loss": 0.1551, "num_tokens": 7596789.0, "reward": -2.0338134765625, "reward_std": 0.6118170022964478, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.872314453125, "rewards/ppl_reward/std": 3.486762046813965, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 138.21875, "completions/mean_terminated_length": 94.65573120117188, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5092902832774902, "grad_norm": 4.545238971710205, "kl": 8.6015625, "learning_rate": 1.9999822412737767e-05, "loss": 0.7728, "num_tokens": 7612475.0, "reward": -1.8277587890625, "reward_std": 0.39396747946739197, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.444580078125, "rewards/ppl_reward/std": 4.927792072296143, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 151.484375, "completions/mean_terminated_length": 93.3166732788086, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5105086810843741, "grad_norm": 14.356460571289062, "kl": 8.64453125, "learning_rate": 1.9999768049500304e-05, "loss": 1.0183, "num_tokens": 7628874.0, "reward": -2.149169921875, "reward_std": 0.43268832564353943, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.14208984375, "rewards/ppl_reward/std": 3.6606552600860596, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.511727078891258, "grad_norm": 2.4371232986450195, "kl": 1.505859375, "learning_rate": 1.999970643795027e-05, "loss": -0.0267, "num_tokens": 7641698.0, "reward": -1.66015625, "reward_std": 0.44853219389915466, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.25, "rewards/ppl_reward/std": 4.021434783935547, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 102.3125, "completions/mean_terminated_length": 102.3125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.5129454766981419, "grad_norm": 2.8761909008026123, "kl": 1.68359375, "learning_rate": 1.9999637578132328e-05, "loss": -0.0069, "num_tokens": 7655662.0, "reward": -1.0845947265625, "reward_std": 0.3459935188293457, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.098876953125, "rewards/ppl_reward/std": 3.494537353515625, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 91.203125, "completions/mean_terminated_length": 91.203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5141638745050259, "grad_norm": 2.319767951965332, "kl": 1.142578125, "learning_rate": 1.9999561470096385e-05, "loss": 0.0015, "num_tokens": 7668107.0, "reward": -2.4849853515625, "reward_std": 0.35915374755859375, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.891845703125, "rewards/ppl_reward/std": 6.5759406089782715, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 106.625, "completions/mean_terminated_length": 106.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5153822723119098, "grad_norm": 1.9563497304916382, "kl": 1.1064453125, "learning_rate": 1.9999478113897614e-05, "loss": -0.0109, "num_tokens": 7681539.0, "reward": 0.0506591796875, "reward_std": 0.14224883913993835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -3.898681640625, "rewards/ppl_reward/std": 1.540575385093689, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 128.59375, "completions/mean_terminated_length": 128.59375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5166006701187937, "grad_norm": 1.885679841041565, "kl": 0.9033203125, "learning_rate": 1.999938750959643e-05, "loss": -0.0013, "num_tokens": 7697145.0, "reward": -1.0401611328125, "reward_std": 0.20932990312576294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.080322265625, "rewards/ppl_reward/std": 4.8751654624938965, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5178190679256778, "grad_norm": 1.716955542564392, "kl": 0.9501953125, "learning_rate": 1.9999289657258505e-05, "loss": 0.045, "num_tokens": 7712885.0, "reward": -0.7320556640625, "reward_std": 0.1412947177886963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.464111328125, "rewards/ppl_reward/std": 2.080648899078369, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 159.1875, "completions/mean_terminated_length": 159.1875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5190374657325617, "grad_norm": 1.6274858713150024, "kl": 0.8466796875, "learning_rate": 1.9999184556954777e-05, "loss": 0.02, "num_tokens": 7730321.0, "reward": -1.860107421875, "reward_std": 0.5323836803436279, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.68115234375, "rewards/ppl_reward/std": 10.500838279724121, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 157.90625, "completions/mean_terminated_length": 157.90625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5202558635394456, "grad_norm": 1.6832256317138672, "kl": 0.8291015625, "learning_rate": 1.9999072208761417e-05, "loss": 0.0021, "num_tokens": 7747339.0, "reward": -1.516845703125, "reward_std": 0.17928612232208252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.03369140625, "rewards/ppl_reward/std": 3.6413230895996094, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 175.09375, "completions/mean_terminated_length": 175.09375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.5214742613463296, "grad_norm": 1.54912269115448, "kl": 0.7724609375, "learning_rate": 1.9998952612759868e-05, "loss": 0.0174, "num_tokens": 7765697.0, "reward": -0.5765380859375, "reward_std": 0.12070231139659882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.153076171875, "rewards/ppl_reward/std": 1.9442212581634521, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 190.078125, "completions/mean_terminated_length": 190.078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5226926591532135, "grad_norm": 1.5099427700042725, "kl": 0.783203125, "learning_rate": 1.999882576903682e-05, "loss": 0.0308, "num_tokens": 7784958.0, "reward": -0.7205810546875, "reward_std": 0.2300763875246048, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.441162109375, "rewards/ppl_reward/std": 2.3126773834228516, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5239110569600974, "grad_norm": 1.4007370471954346, "kl": 0.7705078125, "learning_rate": 1.9998691677684204e-05, "loss": 0.0126, "num_tokens": 7804774.0, "reward": -0.3580322265625, "reward_std": 0.11780010163784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.716064453125, "rewards/ppl_reward/std": 2.2476539611816406, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 227.203125, "completions/mean_terminated_length": 227.203125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5251294547669815, "grad_norm": 1.4060800075531006, "kl": 0.76171875, "learning_rate": 1.999855033879923e-05, "loss": 0.0287, "num_tokens": 7827003.0, "reward": -2.1612548828125, "reward_std": 0.4776395559310913, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.127197265625, "rewards/ppl_reward/std": 4.79677152633667, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 226.453125, "completions/mean_terminated_length": 226.453125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.5263478525738654, "grad_norm": 1.646032691001892, "kl": 0.8046875, "learning_rate": 1.999840175248434e-05, "loss": 0.0174, "num_tokens": 7848720.0, "reward": -0.478271484375, "reward_std": 0.2662748694419861, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.87841796875, "rewards/ppl_reward/std": 4.063665866851807, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.5275662503807493, "grad_norm": 1.3906244039535522, "kl": 0.74609375, "learning_rate": 1.9998245918847234e-05, "loss": 0.0339, "num_tokens": 7874596.0, "reward": -1.3204345703125, "reward_std": 0.2531730532646179, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.523681640625, "rewards/ppl_reward/std": 4.098974227905273, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 295.78125, "completions/mean_terminated_length": 295.78125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5287846481876333, "grad_norm": 1.5440813302993774, "kl": 0.7666015625, "learning_rate": 1.999808283800087e-05, "loss": 0.0787, "num_tokens": 7901358.0, "reward": -1.0794677734375, "reward_std": 0.24476779997348785, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.080810546875, "rewards/ppl_reward/std": 2.962571382522583, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 289.359375, "completions/mean_terminated_length": 289.359375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5300030459945172, "grad_norm": 1.2462005615234375, "kl": 0.9326171875, "learning_rate": 1.999791251006346e-05, "loss": 0.0057, "num_tokens": 7927437.0, "reward": -0.731201171875, "reward_std": 0.22290171682834625, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.34521484375, "rewards/ppl_reward/std": 2.627723455429077, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 268.515625, "completions/mean_terminated_length": 268.515625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5312214438014011, "grad_norm": 1.8105847835540771, "kl": 1.244140625, "learning_rate": 1.9997734935158468e-05, "loss": 0.0368, "num_tokens": 7952598.0, "reward": -1.27294921875, "reward_std": 1.2288861274719238, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.2490234375, "rewards/ppl_reward/std": 3.3386118412017822, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 265.671875, "completions/mean_terminated_length": 265.671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5324398416082851, "grad_norm": 1.8312350511550903, "kl": 1.4970703125, "learning_rate": 1.99975501134146e-05, "loss": 0.0811, "num_tokens": 7976769.0, "reward": -1.55859375, "reward_std": 0.889530599117279, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.7734375, "rewards/ppl_reward/std": 4.412378311157227, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 205.984375, "completions/mean_terminated_length": 205.984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5336582394151691, "grad_norm": 4.241653919219971, "kl": 2.171875, "learning_rate": 1.9997358044965833e-05, "loss": 0.0112, "num_tokens": 7997032.0, "reward": -2.2281494140625, "reward_std": 2.0723876953125, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.987548828125, "rewards/ppl_reward/std": 7.723668098449707, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 166.171875, "completions/mean_terminated_length": 166.171875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.534876637222053, "grad_norm": 2.27813720703125, "kl": 1.62890625, "learning_rate": 1.999715872995138e-05, "loss": 0.0722, "num_tokens": 8014059.0, "reward": -2.2294921875, "reward_std": 1.1706180572509766, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.365234375, "rewards/ppl_reward/std": 7.974849700927734, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 158.65625, "completions/mean_terminated_length": 158.65625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.536095035028937, "grad_norm": 1.6270854473114014, "kl": 0.841796875, "learning_rate": 1.9996952168515715e-05, "loss": 0.0424, "num_tokens": 8030709.0, "reward": -2.2803955078125, "reward_std": 0.6993407011032104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.560791015625, "rewards/ppl_reward/std": 11.252034187316895, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5373134328358209, "grad_norm": 1.6906601190567017, "kl": 1.1083984375, "learning_rate": 1.9996738360808566e-05, "loss": -0.0163, "num_tokens": 8047269.0, "reward": -2.30712890625, "reward_std": 0.5533499121665955, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -8.5439453125, "rewards/ppl_reward/std": 7.83029317855835, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 116.546875, "completions/mean_terminated_length": 116.546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5385318306427048, "grad_norm": 2.5749547481536865, "kl": 1.302734375, "learning_rate": 1.9996517306984914e-05, "loss": 0.0185, "num_tokens": 8060712.0, "reward": -1.733154296875, "reward_std": 0.874397337436676, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.13037109375, "rewards/ppl_reward/std": 3.9280812740325928, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 115.9375, "completions/mean_terminated_length": 115.9375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5397502284495888, "grad_norm": 1.7842499017715454, "kl": 0.7958984375, "learning_rate": 1.999628900720498e-05, "loss": -0.0805, "num_tokens": 8074828.0, "reward": -1.2645263671875, "reward_std": 0.2056039720773697, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.497802734375, "rewards/ppl_reward/std": 5.277601718902588, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 140.09375, "completions/mean_terminated_length": 140.09375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.5409686262564727, "grad_norm": 1.5725691318511963, "kl": 0.7060546875, "learning_rate": 1.999605346163426e-05, "loss": -0.0231, "num_tokens": 8090866.0, "reward": -1.513427734375, "reward_std": 0.2652665972709656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.02685546875, "rewards/ppl_reward/std": 2.667462110519409, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.5421870240633567, "grad_norm": 1.533540964126587, "kl": 0.671875, "learning_rate": 1.9995810670443474e-05, "loss": -0.0271, "num_tokens": 8106898.0, "reward": -2.24951171875, "reward_std": 0.6490342020988464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.4990234375, "rewards/ppl_reward/std": 9.511281967163086, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5434054218702407, "grad_norm": 1.4145058393478394, "kl": 0.7001953125, "learning_rate": 1.999556063380862e-05, "loss": -0.0593, "num_tokens": 8123622.0, "reward": -1.018310546875, "reward_std": 0.15019918978214264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.03662109375, "rewards/ppl_reward/std": 4.493890285491943, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 170.59375, "completions/mean_terminated_length": 170.59375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5446238196771246, "grad_norm": 1.5564899444580078, "kl": 0.6953125, "learning_rate": 1.9995303351910934e-05, "loss": 0.0225, "num_tokens": 8141460.0, "reward": -0.1507568359375, "reward_std": 0.147216334939003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.301513671875, "rewards/ppl_reward/std": 1.5934453010559082, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 198.453125, "completions/mean_terminated_length": 198.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5458422174840085, "grad_norm": 1.2346314191818237, "kl": 0.669921875, "learning_rate": 1.99950388249369e-05, "loss": 0.0286, "num_tokens": 8160913.0, "reward": -0.87109375, "reward_std": 0.20626169443130493, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.7421875, "rewards/ppl_reward/std": 3.74143123626709, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5470606152908924, "grad_norm": 1.1545137166976929, "kl": 0.63671875, "learning_rate": 1.999476705307827e-05, "loss": -0.0246, "num_tokens": 8182257.0, "reward": -0.6298828125, "reward_std": 0.13163116574287415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.259765625, "rewards/ppl_reward/std": 1.717194676399231, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 250.71875, "completions/mean_terminated_length": 250.71875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5482790130977764, "grad_norm": 2.584360361099243, "kl": 0.73828125, "learning_rate": 1.999448803653203e-05, "loss": -0.0594, "num_tokens": 8204967.0, "reward": -2.02606201171875, "reward_std": 0.45006585121154785, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.0052490234375, "rewards/ppl_reward/std": 6.615805625915527, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 250.390625, "completions/mean_terminated_length": 250.390625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.5494974109046604, "grad_norm": 1.4379503726959229, "kl": 0.6923828125, "learning_rate": 1.999420177550043e-05, "loss": -0.006, "num_tokens": 8227976.0, "reward": -2.3321533203125, "reward_std": 0.1668241322040558, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.179931640625, "rewards/ppl_reward/std": 2.244145154953003, "rewards/tag_count_reward/mean": 0.2578125, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 308.734375, "completions/mean_terminated_length": 308.734375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5507158087115444, "grad_norm": 1.1387797594070435, "kl": 0.62890625, "learning_rate": 1.999390827019096e-05, "loss": -0.0129, "num_tokens": 8254919.0, "reward": -3.8121337890625, "reward_std": 0.3584495484828949, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.108642578125, "rewards/ppl_reward/std": 10.342329978942871, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 410.265625, "completions/mean_terminated_length": 380.0819396972656, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5519342065184283, "grad_norm": 1.5541712045669556, "kl": 0.5703125, "learning_rate": 1.9993607520816368e-05, "loss": 0.222, "num_tokens": 8288520.0, "reward": -3.40399169921875, "reward_std": 1.1298130750656128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.9798583984375, "rewards/ppl_reward/std": 5.3835039138793945, "rewards/tag_count_reward/mean": 0.0859375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 346.015625, "completions/mean_terminated_length": 346.015625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5531526043253122, "grad_norm": 1.0849790573120117, "kl": 0.58984375, "learning_rate": 1.9993299527594655e-05, "loss": 0.0623, "num_tokens": 8317281.0, "reward": -2.152587890625, "reward_std": 0.43475547432899475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.79736328125, "rewards/ppl_reward/std": 1.965294361114502, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.07031939178705215, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 360.59375, "completions/mean_terminated_length": 360.59375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.5543710021321961, "grad_norm": 0.9884652495384216, "kl": 0.5751953125, "learning_rate": 1.9992984290749066e-05, "loss": 0.0501, "num_tokens": 8347775.0, "reward": -2.477294921875, "reward_std": 0.1429985761642456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.45458984375, "rewards/ppl_reward/std": 3.209697723388672, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.5555893999390801, "grad_norm": 1.1914674043655396, "kl": 0.6298828125, "learning_rate": 1.99926618105081e-05, "loss": -0.0116, "num_tokens": 8377127.0, "reward": -3.18798828125, "reward_std": 0.1987016797065735, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.8759765625, "rewards/ppl_reward/std": 5.553603649139404, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.556807797745964, "grad_norm": 1.254350185394287, "kl": 0.58984375, "learning_rate": 1.999233208710551e-05, "loss": 0.0354, "num_tokens": 8405303.0, "reward": -3.109375, "reward_std": 0.33859187364578247, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.71875, "rewards/ppl_reward/std": 4.166875839233398, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 328.453125, "completions/mean_terminated_length": 328.453125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.558026195552848, "grad_norm": 1.1274100542068481, "kl": 0.615234375, "learning_rate": 1.9991995120780292e-05, "loss": 0.006, "num_tokens": 8432724.0, "reward": -2.21051025390625, "reward_std": 0.18998858332633972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.9210205078125, "rewards/ppl_reward/std": 2.1950581073760986, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.559244593359732, "grad_norm": 1.0031825304031372, "kl": 0.59765625, "learning_rate": 1.9991650911776697e-05, "loss": -0.0517, "num_tokens": 8461716.0, "reward": -3.2679443359375, "reward_std": 0.5494484305381775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.035888671875, "rewards/ppl_reward/std": 8.623903274536133, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 385.578125, "completions/mean_terminated_length": 385.578125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.5604629911666159, "grad_norm": 0.997236967086792, "kl": 0.564453125, "learning_rate": 1.999129946034422e-05, "loss": 0.0287, "num_tokens": 8493497.0, "reward": -2.14337158203125, "reward_std": 0.12977126240730286, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.7867431640625, "rewards/ppl_reward/std": 3.711747646331787, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 372.828125, "completions/mean_terminated_length": 372.828125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5616813889734998, "grad_norm": 0.9742128849029541, "kl": 0.5859375, "learning_rate": 1.9990940766737617e-05, "loss": 0.012, "num_tokens": 8524502.0, "reward": -1.8680419921875, "reward_std": 0.1758880466222763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.236083984375, "rewards/ppl_reward/std": 1.705202579498291, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 392.25, "completions/mean_terminated_length": 392.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.5628997867803838, "grad_norm": 0.9992477893829346, "kl": 0.5400390625, "learning_rate": 1.9990574831216877e-05, "loss": 0.0111, "num_tokens": 8557286.0, "reward": -3.0789794921875, "reward_std": 0.23572970926761627, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.657958984375, "rewards/ppl_reward/std": 2.091980218887329, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5641181845872677, "grad_norm": 0.8947311043739319, "kl": 0.5693359375, "learning_rate": 1.9990201654047258e-05, "loss": -0.0171, "num_tokens": 8588374.0, "reward": -2.49420166015625, "reward_std": 0.21187236905097961, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.4805908203125, "rewards/ppl_reward/std": 3.34059739112854, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 360.90625, "completions/mean_terminated_length": 360.90625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5653365823941517, "grad_norm": 1.0756645202636719, "kl": 0.5517578125, "learning_rate": 1.998982123549925e-05, "loss": -0.0673, "num_tokens": 8618504.0, "reward": -3.77215576171875, "reward_std": 0.2443998008966446, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.0364990234375, "rewards/ppl_reward/std": 7.501314640045166, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5665549802010357, "grad_norm": 1.1093961000442505, "kl": 0.5458984375, "learning_rate": 1.99894335758486e-05, "loss": -0.0078, "num_tokens": 8650488.0, "reward": -2.2825927734375, "reward_std": 0.1942533552646637, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.065185546875, "rewards/ppl_reward/std": 2.6778223514556885, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 374.953125, "completions/mean_terminated_length": 374.953125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5677733780079196, "grad_norm": 1.1389786005020142, "kl": 0.564453125, "learning_rate": 1.9989038675376305e-05, "loss": -0.0384, "num_tokens": 8681053.0, "reward": -7.2059326171875, "reward_std": 0.9005177021026611, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -14.911865234375, "rewards/ppl_reward/std": 24.690048217773438, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 374.59375, "completions/mean_terminated_length": 374.59375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5689917758148035, "grad_norm": 1.1070278882980347, "kl": 0.568359375, "learning_rate": 1.9988636534368603e-05, "loss": 0.0727, "num_tokens": 8711595.0, "reward": -2.44580078125, "reward_std": 0.16807842254638672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.3916015625, "rewards/ppl_reward/std": 2.017864227294922, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 372.140625, "completions/mean_terminated_length": 372.140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.5702101736216875, "grad_norm": 0.9669952392578125, "kl": 0.5625, "learning_rate": 1.998822715311699e-05, "loss": -0.0125, "num_tokens": 8742004.0, "reward": -5.37109375, "reward_std": 0.41448819637298584, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.234375, "rewards/ppl_reward/std": 7.464670181274414, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 324.765625, "completions/mean_terminated_length": 313.66668701171875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5714285714285714, "grad_norm": 1.2119388580322266, "kl": 0.572265625, "learning_rate": 1.99878105319182e-05, "loss": 0.0911, "num_tokens": 8769485.0, "reward": -3.7406005859375, "reward_std": 0.4390832185745239, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.981201171875, "rewards/ppl_reward/std": 5.485162734985352, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 255.171875, "completions/mean_terminated_length": 255.171875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5726469692354553, "grad_norm": 1.3167110681533813, "kl": 0.630859375, "learning_rate": 1.9987386671074225e-05, "loss": -0.0963, "num_tokens": 8792600.0, "reward": -5.93927001953125, "reward_std": 0.3911377787590027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.3707275390625, "rewards/ppl_reward/std": 15.1445894241333, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 229.484375, "completions/mean_terminated_length": 229.484375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5738653670423394, "grad_norm": 1.0911709070205688, "kl": 0.6318359375, "learning_rate": 1.9986955570892302e-05, "loss": -0.0049, "num_tokens": 8814575.0, "reward": -2.80224609375, "reward_std": 0.2927440106868744, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.0966796875, "rewards/ppl_reward/std": 3.2117836475372314, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 216.796875, "completions/mean_terminated_length": 216.796875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5750837648492233, "grad_norm": 1.2651411294937134, "kl": 0.609375, "learning_rate": 1.9986517231684903e-05, "loss": -0.0269, "num_tokens": 8835962.0, "reward": -2.96142578125, "reward_std": 0.23818224668502808, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.4072265625, "rewards/ppl_reward/std": 2.274170398712158, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 161.359375, "completions/mean_terminated_length": 161.359375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5763021626561072, "grad_norm": 1.2902157306671143, "kl": 0.712890625, "learning_rate": 1.998607165376977e-05, "loss": -0.0954, "num_tokens": 8853129.0, "reward": -2.7259521484375, "reward_std": 0.22156161069869995, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.951904296875, "rewards/ppl_reward/std": 2.2956275939941406, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 188.609375, "completions/mean_terminated_length": 188.609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5775205604629912, "grad_norm": 1.437052607536316, "kl": 0.7373046875, "learning_rate": 1.9985618837469864e-05, "loss": -0.0044, "num_tokens": 8872400.0, "reward": -2.798095703125, "reward_std": 0.21496078372001648, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.09619140625, "rewards/ppl_reward/std": 4.647305011749268, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5787389582698751, "grad_norm": 1.3464728593826294, "kl": 0.71484375, "learning_rate": 1.9985158783113423e-05, "loss": -0.0115, "num_tokens": 8890196.0, "reward": -2.000244140625, "reward_std": 0.1894838511943817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.50048828125, "rewards/ppl_reward/std": 1.5576759576797485, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 153.90625, "completions/mean_terminated_length": 153.90625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.579957356076759, "grad_norm": 1.5124928951263428, "kl": 0.734375, "learning_rate": 1.9984691491033908e-05, "loss": -0.0751, "num_tokens": 8907982.0, "reward": -2.500244140625, "reward_std": 0.12680655717849731, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.49267578125, "rewards/ppl_reward/std": 2.1046011447906494, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 142.21875, "completions/mean_terminated_length": 142.21875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5811757538836431, "grad_norm": 1.5328861474990845, "kl": 0.7939453125, "learning_rate": 1.9984216961570038e-05, "loss": 0.0006, "num_tokens": 8923988.0, "reward": -2.768798828125, "reward_std": 0.18273955583572388, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.02978515625, "rewards/ppl_reward/std": 1.2839200496673584, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 172.765625, "completions/mean_terminated_length": 172.765625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.582394151690527, "grad_norm": 1.6911970376968384, "kl": 0.7373046875, "learning_rate": 1.998373519506577e-05, "loss": 0.0497, "num_tokens": 8942173.0, "reward": -3.173828125, "reward_std": 0.2164883017539978, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.84765625, "rewards/ppl_reward/std": 3.377023935317993, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.5836125494974109, "grad_norm": 1.8390294313430786, "kl": 0.7724609375, "learning_rate": 1.9983246191870318e-05, "loss": 0.037, "num_tokens": 8959765.0, "reward": -2.84320068359375, "reward_std": 0.2547873258590698, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.1864013671875, "rewards/ppl_reward/std": 2.569812536239624, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.5848309473042949, "grad_norm": 1.4423696994781494, "kl": 0.7431640625, "learning_rate": 1.998274995233813e-05, "loss": -0.0106, "num_tokens": 8977405.0, "reward": -4.42828369140625, "reward_std": 0.4192717969417572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.3565673828125, "rewards/ppl_reward/std": 6.672569751739502, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 195.578125, "completions/mean_terminated_length": 195.578125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5860493451111788, "grad_norm": 1.2117096185684204, "kl": 0.697265625, "learning_rate": 1.998224647682891e-05, "loss": -0.0469, "num_tokens": 8997082.0, "reward": -2.99169921875, "reward_std": 0.14330711960792542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.4833984375, "rewards/ppl_reward/std": 3.314422369003296, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 183.296875, "completions/mean_terminated_length": 183.296875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5872677429180627, "grad_norm": 1.3986477851867676, "kl": 0.6962890625, "learning_rate": 1.9981735765707597e-05, "loss": 0.0167, "num_tokens": 9016301.0, "reward": -1.966064453125, "reward_std": 0.09845587611198425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.43212890625, "rewards/ppl_reward/std": 1.362918734550476, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 177.328125, "completions/mean_terminated_length": 177.328125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5884861407249466, "grad_norm": 1.503017783164978, "kl": 0.748046875, "learning_rate": 1.998121781934438e-05, "loss": 0.0089, "num_tokens": 9034354.0, "reward": -3.0596923828125, "reward_std": 0.26345986127853394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.619384765625, "rewards/ppl_reward/std": 5.422893524169922, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5897045385318307, "grad_norm": 1.3058661222457886, "kl": 0.68359375, "learning_rate": 1.9980692638114688e-05, "loss": 0.1135, "num_tokens": 9054442.0, "reward": -8.516845703125, "reward_std": 2.0832223892211914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -17.53369140625, "rewards/ppl_reward/std": 28.855640411376953, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 217.71875, "completions/mean_terminated_length": 217.71875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5909229363387146, "grad_norm": 1.242440938949585, "kl": 0.650390625, "learning_rate": 1.9980160222399206e-05, "loss": -0.0784, "num_tokens": 9075544.0, "reward": -3.6102294921875, "reward_std": 0.4149617552757263, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.712646484375, "rewards/ppl_reward/std": 6.356925010681152, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 209.359375, "completions/mean_terminated_length": 209.359375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.5921413341455986, "grad_norm": 1.1960666179656982, "kl": 0.7119140625, "learning_rate": 1.9979620572583846e-05, "loss": -0.1329, "num_tokens": 9095631.0, "reward": -6.3983154296875, "reward_std": 0.6382162570953369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -13.296630859375, "rewards/ppl_reward/std": 15.413671493530273, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 238.15625, "completions/mean_terminated_length": 238.15625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5933597319524825, "grad_norm": 1.1422749757766724, "kl": 0.654296875, "learning_rate": 1.9979073689059777e-05, "loss": 0.0216, "num_tokens": 9117849.0, "reward": -2.237060546875, "reward_std": 0.10562708973884583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.97412109375, "rewards/ppl_reward/std": 1.4445544481277466, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 251.390625, "completions/mean_terminated_length": 251.390625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.5945781297593664, "grad_norm": 1.0635483264923096, "kl": 0.634765625, "learning_rate": 1.99785195722234e-05, "loss": -0.0406, "num_tokens": 9141746.0, "reward": -2.0828857421875, "reward_std": 0.1479509174823761, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.665771484375, "rewards/ppl_reward/std": 2.4261820316314697, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 274.8125, "completions/mean_terminated_length": 274.8125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.5957965275662503, "grad_norm": 1.2176910638809204, "kl": 0.6865234375, "learning_rate": 1.9977958222476374e-05, "loss": -0.0342, "num_tokens": 9166662.0, "reward": -3.130615234375, "reward_std": 0.32018375396728516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.76123046875, "rewards/ppl_reward/std": 3.4513490200042725, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 296.03125, "completions/mean_terminated_length": 296.03125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5970149253731343, "grad_norm": 1.5358610153198242, "kl": 0.65625, "learning_rate": 1.9977389640225587e-05, "loss": 0.0412, "num_tokens": 9192112.0, "reward": -2.05810546875, "reward_std": 0.15640521049499512, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.6162109375, "rewards/ppl_reward/std": 2.516029119491577, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 265.890625, "completions/mean_terminated_length": 265.890625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5982333231800183, "grad_norm": 1.5124924182891846, "kl": 0.7265625, "learning_rate": 1.9976813825883182e-05, "loss": 0.0507, "num_tokens": 9215601.0, "reward": -2.3204345703125, "reward_std": 0.19024071097373962, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.125244140625, "rewards/ppl_reward/std": 2.2708699703216553, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 210.921875, "completions/mean_terminated_length": 210.921875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5994517209869022, "grad_norm": 1.9607492685317993, "kl": 0.9033203125, "learning_rate": 1.9976230779866527e-05, "loss": 0.1071, "num_tokens": 9236724.0, "reward": -3.24169921875, "reward_std": 0.4472050666809082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.9833984375, "rewards/ppl_reward/std": 3.851651906967163, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 178.296875, "completions/mean_terminated_length": 178.296875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.6006701187937862, "grad_norm": 2.4386281967163086, "kl": 2.123046875, "learning_rate": 1.9975640502598243e-05, "loss": 0.0995, "num_tokens": 9255415.0, "reward": -4.05810546875, "reward_std": 0.3430749177932739, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.5927734375, "rewards/ppl_reward/std": 5.079183101654053, "rewards/tag_count_reward/mean": 0.23828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 161.421875, "completions/mean_terminated_length": 161.421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6018885166006701, "grad_norm": 4.169076442718506, "kl": 3.171875, "learning_rate": 1.9975042994506197e-05, "loss": 0.0993, "num_tokens": 9273234.0, "reward": -2.6617431640625, "reward_std": 0.3721253275871277, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.815673828125, "rewards/ppl_reward/std": 2.302927017211914, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 143.234375, "completions/mean_terminated_length": 143.234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.603106914407554, "grad_norm": 5.2758469581604, "kl": 4.29296875, "learning_rate": 1.997443825602349e-05, "loss": 0.2212, "num_tokens": 9288833.0, "reward": -2.7178955078125, "reward_std": 0.34505903720855713, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.920166015625, "rewards/ppl_reward/std": 2.9443812370300293, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 154.78125, "completions/mean_terminated_length": 154.78125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.604325312214438, "grad_norm": 1.8651466369628906, "kl": 2.216796875, "learning_rate": 1.9973826287588465e-05, "loss": 0.0722, "num_tokens": 9305611.0, "reward": -2.47613525390625, "reward_std": 0.24109522998332977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.4522705078125, "rewards/ppl_reward/std": 3.5324037075042725, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 156.03125, "completions/mean_terminated_length": 156.03125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.605543710021322, "grad_norm": 1.4935716390609741, "kl": 1.26171875, "learning_rate": 1.9973207089644705e-05, "loss": 0.0747, "num_tokens": 9322573.0, "reward": -1.6763916015625, "reward_std": 0.27144503593444824, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -3.844970703125, "rewards/ppl_reward/std": 1.9081240892410278, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 163.15625, "completions/mean_terminated_length": 163.15625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.6067621078282059, "grad_norm": 2.0908427238464355, "kl": 1.03515625, "learning_rate": 1.9972580662641035e-05, "loss": -0.0937, "num_tokens": 9340815.0, "reward": -5.423583984375, "reward_std": 0.44735753536224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.34716796875, "rewards/ppl_reward/std": 10.629142761230469, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6079805056350899, "grad_norm": 1.704950213432312, "kl": 0.875, "learning_rate": 1.997194700703152e-05, "loss": 0.0612, "num_tokens": 9359883.0, "reward": -4.6181640625, "reward_std": 0.4635962247848511, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.728515625, "rewards/ppl_reward/std": 5.64576530456543, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 221.390625, "completions/mean_terminated_length": 221.390625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6091989034419738, "grad_norm": 2.309340238571167, "kl": 0.7783203125, "learning_rate": 1.997130612327546e-05, "loss": -0.0967, "num_tokens": 9381284.0, "reward": -2.787353515625, "reward_std": 0.19559502601623535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.06689453125, "rewards/ppl_reward/std": 5.809889793395996, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 267.640625, "completions/mean_terminated_length": 267.640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6104173012488577, "grad_norm": 1.3210164308547974, "kl": 0.7587890625, "learning_rate": 1.9970658011837404e-05, "loss": -0.0708, "num_tokens": 9406469.0, "reward": -3.006103515625, "reward_std": 0.300824910402298, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.50439453125, "rewards/ppl_reward/std": 2.3653314113616943, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.05441221967339516, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 253.0625, "completions/mean_terminated_length": 253.0625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6116356990557417, "grad_norm": 1.100269079208374, "kl": 0.7470703125, "learning_rate": 1.9970002673187135e-05, "loss": 0.0142, "num_tokens": 9429785.0, "reward": -2.84033203125, "reward_std": 0.17625628411769867, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.1884765625, "rewards/ppl_reward/std": 3.0743625164031982, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.03125, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 255.296875, "completions/mean_terminated_length": 255.296875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6128540968626256, "grad_norm": 0.9571599960327148, "kl": 0.69140625, "learning_rate": 1.996934010779967e-05, "loss": -0.0311, "num_tokens": 9453556.0, "reward": -3.115966796875, "reward_std": 0.21017290651798248, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.73193359375, "rewards/ppl_reward/std": 3.845730781555176, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 256.78125, "completions/mean_terminated_length": 256.78125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6140724946695096, "grad_norm": 1.0489975214004517, "kl": 0.705078125, "learning_rate": 1.9968670316155266e-05, "loss": 0.1077, "num_tokens": 9477582.0, "reward": -4.3758544921875, "reward_std": 0.2606605887413025, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.251708984375, "rewards/ppl_reward/std": 4.205892562866211, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 275.265625, "completions/mean_terminated_length": 275.265625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.6152908924763936, "grad_norm": 0.9345181584358215, "kl": 0.703125, "learning_rate": 1.9967993298739427e-05, "loss": -0.0332, "num_tokens": 9502367.0, "reward": -2.42724609375, "reward_std": 0.14550170302391052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.3544921875, "rewards/ppl_reward/std": 1.8188813924789429, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 291.328125, "completions/mean_terminated_length": 291.328125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6165092902832775, "grad_norm": 0.9607383608818054, "kl": 0.6474609375, "learning_rate": 1.9967309056042884e-05, "loss": 0.0131, "num_tokens": 9528900.0, "reward": -3.42413330078125, "reward_std": 0.19824090600013733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.3482666015625, "rewards/ppl_reward/std": 5.087908744812012, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 289.828125, "completions/mean_terminated_length": 289.828125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6177276880901614, "grad_norm": 0.924977719783783, "kl": 0.6572265625, "learning_rate": 1.996661758856161e-05, "loss": -0.0315, "num_tokens": 9554729.0, "reward": -3.334716796875, "reward_std": 0.20971621572971344, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.16943359375, "rewards/ppl_reward/std": 5.287108421325684, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 248.765625, "completions/mean_terminated_length": 248.765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6189460858970454, "grad_norm": 0.9580579400062561, "kl": 0.6953125, "learning_rate": 1.996591889679681e-05, "loss": -0.0801, "num_tokens": 9577322.0, "reward": -3.68212890625, "reward_std": 0.37032458186149597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.8642578125, "rewards/ppl_reward/std": 3.8021280765533447, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.6201644837039293, "grad_norm": 0.9992055892944336, "kl": 0.6669921875, "learning_rate": 1.9965212981254936e-05, "loss": -0.0262, "num_tokens": 9603634.0, "reward": -2.49658203125, "reward_std": 0.17137186229228973, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.4931640625, "rewards/ppl_reward/std": 4.008516311645508, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 267.328125, "completions/mean_terminated_length": 267.328125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.6213828815108133, "grad_norm": 0.9479066729545593, "kl": 0.6669921875, "learning_rate": 1.9964499842447665e-05, "loss": 0.049, "num_tokens": 9628015.0, "reward": -3.537353515625, "reward_std": 0.28599950671195984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.57470703125, "rewards/ppl_reward/std": 3.964578151702881, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 254.15625, "completions/mean_terminated_length": 254.15625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.6226012793176973, "grad_norm": 1.005910038948059, "kl": 0.7265625, "learning_rate": 1.9963779480891917e-05, "loss": 0.0104, "num_tokens": 9651057.0, "reward": -2.650390625, "reward_std": 0.17157897353172302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.80078125, "rewards/ppl_reward/std": 2.1226608753204346, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 295.84375, "completions/mean_terminated_length": 295.84375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6238196771245812, "grad_norm": 0.9136443734169006, "kl": 0.6884765625, "learning_rate": 1.996305189710984e-05, "loss": -0.133, "num_tokens": 9677015.0, "reward": -4.07763671875, "reward_std": 0.4098285436630249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.6552734375, "rewards/ppl_reward/std": 12.488410949707031, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 349.515625, "completions/mean_terminated_length": 338.8095397949219, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.6250380749314651, "grad_norm": 0.8668456673622131, "kl": 0.6064453125, "learning_rate": 1.9962317091628826e-05, "loss": 0.0585, "num_tokens": 9706648.0, "reward": -3.281982421875, "reward_std": 0.346537709236145, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.06396484375, "rewards/ppl_reward/std": 4.750816345214844, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 354.421875, "completions/mean_terminated_length": 354.421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6262564727383491, "grad_norm": 0.9505347609519958, "kl": 0.65234375, "learning_rate": 1.9961575064981495e-05, "loss": -0.1645, "num_tokens": 9735771.0, "reward": -2.94195556640625, "reward_std": 0.2849769592285156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.3839111328125, "rewards/ppl_reward/std": 5.967130184173584, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 384.03125, "completions/mean_terminated_length": 384.03125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.627474870545233, "grad_norm": 0.787604570388794, "kl": 0.60546875, "learning_rate": 1.9960825817705708e-05, "loss": -0.0568, "num_tokens": 9767837.0, "reward": -2.3992919921875, "reward_std": 0.17390313744544983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.298583984375, "rewards/ppl_reward/std": 2.263906478881836, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 448.796875, "completions/mean_terminated_length": 410.45001220703125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.6286932683521169, "grad_norm": 0.9829006195068359, "kl": 0.5791015625, "learning_rate": 1.9960069350344547e-05, "loss": 0.2201, "num_tokens": 9803320.0, "reward": -2.47857666015625, "reward_std": 0.41545164585113525, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.4571533203125, "rewards/ppl_reward/std": 3.2664246559143066, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 420.515625, "completions/mean_terminated_length": 401.0483703613281, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.629911666159001, "grad_norm": 1.025153398513794, "kl": 0.595703125, "learning_rate": 1.9959305663446346e-05, "loss": 0.1215, "num_tokens": 9837729.0, "reward": -8.7802734375, "reward_std": 5.599163055419922, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -18.060546875, "rewards/ppl_reward/std": 45.57761764526367, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 349.671875, "completions/mean_terminated_length": 349.671875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6311300639658849, "grad_norm": 0.9213900566101074, "kl": 0.642578125, "learning_rate": 1.9958534757564657e-05, "loss": -0.0484, "num_tokens": 9867052.0, "reward": -2.803955078125, "reward_std": 0.2543116807937622, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.10791015625, "rewards/ppl_reward/std": 4.288301944732666, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 334.546875, "completions/mean_terminated_length": 334.546875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6323484617727688, "grad_norm": 0.9092274904251099, "kl": 0.634765625, "learning_rate": 1.9957756633258264e-05, "loss": -0.0207, "num_tokens": 9895031.0, "reward": -2.6439208984375, "reward_std": 0.20977750420570374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.780029296875, "rewards/ppl_reward/std": 1.3099651336669922, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 290.765625, "completions/mean_terminated_length": 279.12701416015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.6335668595796528, "grad_norm": 1.218122959136963, "kl": 0.68359375, "learning_rate": 1.9956971291091203e-05, "loss": 0.0711, "num_tokens": 9920672.0, "reward": -3.8182373046875, "reward_std": 0.37829041481018066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.136474609375, "rewards/ppl_reward/std": 8.587712287902832, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 280.859375, "completions/mean_terminated_length": 280.859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.6347852573865367, "grad_norm": 0.9677658081054688, "kl": 0.6923828125, "learning_rate": 1.9956178731632715e-05, "loss": -0.0081, "num_tokens": 9945847.0, "reward": -2.3270263671875, "reward_std": 0.13483880460262299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.154052734375, "rewards/ppl_reward/std": 3.0099942684173584, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 242.0625, "completions/mean_terminated_length": 242.0625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6360036551934206, "grad_norm": 1.2001187801361084, "kl": 0.6767578125, "learning_rate": 1.9955378955457286e-05, "loss": -0.0386, "num_tokens": 9969203.0, "reward": -2.4600830078125, "reward_std": 0.1376350224018097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.420166015625, "rewards/ppl_reward/std": 2.8806707859039307, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 231.6875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6372220530003045, "grad_norm": 1.152580738067627, "kl": 0.6982421875, "learning_rate": 1.995457196314464e-05, "loss": -0.0096, "num_tokens": 9991071.0, "reward": -2.0128173828125, "reward_std": 0.20992280542850494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.525634765625, "rewards/ppl_reward/std": 1.5673223733901978, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 201.890625, "completions/mean_terminated_length": 201.890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6384404508071886, "grad_norm": 1.266250491142273, "kl": 0.744140625, "learning_rate": 1.995375775527972e-05, "loss": -0.1187, "num_tokens": 10010464.0, "reward": -1.71173095703125, "reward_std": 0.1467132419347763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -3.9234619140625, "rewards/ppl_reward/std": 1.36470627784729, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 245.0625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.6396588486140725, "grad_norm": 1.313461184501648, "kl": 0.6533203125, "learning_rate": 1.99529363324527e-05, "loss": -0.063, "num_tokens": 10032548.0, "reward": -2.56854248046875, "reward_std": 0.18606168031692505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.6370849609375, "rewards/ppl_reward/std": 5.306604862213135, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 242.234375, "completions/mean_terminated_length": 242.234375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.6408772464209564, "grad_norm": 1.2615362405776978, "kl": 0.6611328125, "learning_rate": 1.9952107695258993e-05, "loss": -0.0725, "num_tokens": 10054739.0, "reward": -2.9730224609375, "reward_std": 0.2757347822189331, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.446044921875, "rewards/ppl_reward/std": 3.6736862659454346, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 276.109375, "completions/mean_terminated_length": 276.109375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.6420956442278404, "grad_norm": 1.1248856782913208, "kl": 0.640625, "learning_rate": 1.9951271844299227e-05, "loss": 0.0577, "num_tokens": 10079738.0, "reward": -3.806884765625, "reward_std": 0.27967166900634766, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.11376953125, "rewards/ppl_reward/std": 5.7376389503479, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 267.421875, "completions/mean_terminated_length": 267.421875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6433140420347243, "grad_norm": 1.1118272542953491, "kl": 0.638671875, "learning_rate": 1.9950428780179274e-05, "loss": -0.0316, "num_tokens": 10103493.0, "reward": -3.9091796875, "reward_std": 0.31273409724235535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.310546875, "rewards/ppl_reward/std": 8.705257415771484, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6445324398416082, "grad_norm": 1.070258617401123, "kl": 0.6796875, "learning_rate": 1.9949578503510224e-05, "loss": -0.0779, "num_tokens": 10126485.0, "reward": -2.4398193359375, "reward_std": 0.1588718146085739, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.371826171875, "rewards/ppl_reward/std": 2.190311908721924, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 267.828125, "completions/mean_terminated_length": 267.828125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.6457508376484923, "grad_norm": 1.0763232707977295, "kl": 0.6328125, "learning_rate": 1.9948721014908398e-05, "loss": 0.0004, "num_tokens": 10150154.0, "reward": -3.26318359375, "reward_std": 0.25883030891418457, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.0185546875, "rewards/ppl_reward/std": 4.704398155212402, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 299.921875, "completions/mean_terminated_length": 299.921875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6469692354553762, "grad_norm": 1.0611963272094727, "kl": 0.5771484375, "learning_rate": 1.994785631499535e-05, "loss": -0.0583, "num_tokens": 10176229.0, "reward": -3.4111328125, "reward_std": 2.5894341468811035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.306640625, "rewards/ppl_reward/std": 15.75212287902832, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 305.640625, "completions/mean_terminated_length": 305.640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6481876332622601, "grad_norm": 1.1356159448623657, "kl": 0.576171875, "learning_rate": 1.994698440439785e-05, "loss": 0.0124, "num_tokens": 10203110.0, "reward": -3.10272216796875, "reward_std": 0.16603723168373108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.7054443359375, "rewards/ppl_reward/std": 3.4912819862365723, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 292.4375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6494060310691441, "grad_norm": 1.035754919052124, "kl": 0.6083984375, "learning_rate": 1.99461052837479e-05, "loss": -0.0611, "num_tokens": 10229682.0, "reward": -3.7222900390625, "reward_std": 0.1604822278022766, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.944580078125, "rewards/ppl_reward/std": 5.202750205993652, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 296.796875, "completions/mean_terminated_length": 296.796875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.650624428876028, "grad_norm": 1.0556273460388184, "kl": 0.62109375, "learning_rate": 1.9945218953682736e-05, "loss": 0.0094, "num_tokens": 10255237.0, "reward": -3.17840576171875, "reward_std": 0.3083653151988983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.8489990234375, "rewards/ppl_reward/std": 3.915818452835083, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6518428266829119, "grad_norm": 0.9826271533966064, "kl": 0.5712890625, "learning_rate": 1.9944325414844808e-05, "loss": -0.046, "num_tokens": 10282525.0, "reward": -3.4669189453125, "reward_std": 0.32758983969688416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.433837890625, "rewards/ppl_reward/std": 4.284818649291992, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 324.921875, "completions/mean_terminated_length": 324.921875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.6530612244897959, "grad_norm": 0.9358056783676147, "kl": 0.59375, "learning_rate": 1.994342466788179e-05, "loss": 0.0327, "num_tokens": 10310432.0, "reward": -2.7318115234375, "reward_std": 0.1605221927165985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.955810546875, "rewards/ppl_reward/std": 3.75834321975708, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03125, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 280.953125, "completions/mean_terminated_length": 280.953125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6542796222966799, "grad_norm": 1.0235707759857178, "kl": 0.6376953125, "learning_rate": 1.99425167134466e-05, "loss": -0.1221, "num_tokens": 10334709.0, "reward": -3.004638671875, "reward_std": 0.3587857484817505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.50927734375, "rewards/ppl_reward/std": 3.6573562622070312, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 284.65625, "completions/mean_terminated_length": 284.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6554980201035638, "grad_norm": 0.9288952350616455, "kl": 0.7392578125, "learning_rate": 1.9941601552197358e-05, "loss": 0.0043, "num_tokens": 10360199.0, "reward": -2.81341552734375, "reward_std": 0.5256913900375366, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.1268310546875, "rewards/ppl_reward/std": 3.454808235168457, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 294.8571472167969, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6567164179104478, "grad_norm": 1.087213158607483, "kl": 0.818359375, "learning_rate": 1.994067918479742e-05, "loss": 0.0433, "num_tokens": 10387079.0, "reward": -2.5469970703125, "reward_std": 0.29387742280960083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.593994140625, "rewards/ppl_reward/std": 1.9810328483581543, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 249.484375, "completions/mean_terminated_length": 249.484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.6579348157173317, "grad_norm": 1.1501327753067017, "kl": 0.9384765625, "learning_rate": 1.993974961191536e-05, "loss": 0.0158, "num_tokens": 10410222.0, "reward": -3.1802978515625, "reward_std": 0.306255966424942, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.860595703125, "rewards/ppl_reward/std": 4.87101936340332, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 206.296875, "completions/mean_terminated_length": 206.296875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.6591532135242156, "grad_norm": 1.2246493101119995, "kl": 1.083984375, "learning_rate": 1.9938812834224978e-05, "loss": 0.0037, "num_tokens": 10430985.0, "reward": -2.4344482421875, "reward_std": 0.17084375023841858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.368896484375, "rewards/ppl_reward/std": 3.0247275829315186, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 192.046875, "completions/mean_terminated_length": 192.046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.6603716113310996, "grad_norm": 1.2148998975753784, "kl": 1.04296875, "learning_rate": 1.9937868852405294e-05, "loss": -0.0295, "num_tokens": 10450260.0, "reward": -6.179931640625, "reward_std": 0.7112518548965454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.85986328125, "rewards/ppl_reward/std": 20.400039672851562, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 232.8125, "completions/mean_terminated_length": 232.8125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.6615900091379836, "grad_norm": 1.0120375156402588, "kl": 0.873046875, "learning_rate": 1.9936917667140556e-05, "loss": -0.0288, "num_tokens": 10471352.0, "reward": -4.3338623046875, "reward_std": 0.5246825218200684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.167724609375, "rewards/ppl_reward/std": 7.183450698852539, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 199.890625, "completions/mean_terminated_length": 199.890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6628084069448675, "grad_norm": 1.3090897798538208, "kl": 0.884765625, "learning_rate": 1.9935959279120227e-05, "loss": -0.0298, "num_tokens": 10491433.0, "reward": -2.0936279296875, "reward_std": 0.17136383056640625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.687255859375, "rewards/ppl_reward/std": 1.456199288368225, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 222.921875, "completions/mean_terminated_length": 222.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.6640268047517515, "grad_norm": 1.0133775472640991, "kl": 0.806640625, "learning_rate": 1.993499368903899e-05, "loss": -0.0906, "num_tokens": 10512548.0, "reward": -2.0291748046875, "reward_std": 0.27637165784835815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.558349609375, "rewards/ppl_reward/std": 1.9104357957839966, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 212.09375, "completions/mean_terminated_length": 212.09375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6652452025586354, "grad_norm": 1.1369421482086182, "kl": 0.8095703125, "learning_rate": 1.9934020897596752e-05, "loss": 0.0779, "num_tokens": 10532778.0, "reward": -3.284423828125, "reward_std": 0.3368569612503052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.06884765625, "rewards/ppl_reward/std": 4.0656280517578125, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 186.9375, "completions/mean_terminated_length": 186.9375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6664636003655193, "grad_norm": 1.1654012203216553, "kl": 0.755859375, "learning_rate": 1.993304090549864e-05, "loss": -0.0322, "num_tokens": 10551454.0, "reward": -3.5103759765625, "reward_std": 0.5766822695732117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.520751953125, "rewards/ppl_reward/std": 8.266345024108887, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 207.921875, "completions/mean_terminated_length": 207.921875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.6676819981724033, "grad_norm": 1.316530466079712, "kl": 0.71875, "learning_rate": 1.9932053713455e-05, "loss": -0.0625, "num_tokens": 10572313.0, "reward": -2.4234619140625, "reward_std": 0.17132604122161865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.346923828125, "rewards/ppl_reward/std": 2.0465826988220215, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 209.984375, "completions/mean_terminated_length": 209.984375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6689003959792872, "grad_norm": 1.2464618682861328, "kl": 0.724609375, "learning_rate": 1.9931059322181396e-05, "loss": 0.0356, "num_tokens": 10592472.0, "reward": -3.1021728515625, "reward_std": 0.3406951427459717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.704345703125, "rewards/ppl_reward/std": 1.7982673645019531, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 198.890625, "completions/mean_terminated_length": 198.890625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6701187937861712, "grad_norm": 1.1973977088928223, "kl": 0.77734375, "learning_rate": 1.9930057732398608e-05, "loss": -0.0799, "num_tokens": 10611985.0, "reward": -3.5130615234375, "reward_std": 0.3961743116378784, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.526123046875, "rewards/ppl_reward/std": 4.9950103759765625, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6713371915930552, "grad_norm": 1.1992275714874268, "kl": 0.7119140625, "learning_rate": 1.9929048944832638e-05, "loss": -0.1353, "num_tokens": 10633361.0, "reward": -2.5902099609375, "reward_std": 0.2844415009021759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.680419921875, "rewards/ppl_reward/std": 3.3127245903015137, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 223.171875, "completions/mean_terminated_length": 223.171875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6725555893999391, "grad_norm": 1.179999828338623, "kl": 0.697265625, "learning_rate": 1.9928032960214707e-05, "loss": -0.0608, "num_tokens": 10654252.0, "reward": -2.869873046875, "reward_std": 0.29843389987945557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.23974609375, "rewards/ppl_reward/std": 5.320001602172852, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 253.921875, "completions/mean_terminated_length": 253.921875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.673773987206823, "grad_norm": 1.0723716020584106, "kl": 0.697265625, "learning_rate": 1.9927009779281247e-05, "loss": -0.0377, "num_tokens": 10677599.0, "reward": -2.4794921875, "reward_std": 0.16582897305488586, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.458984375, "rewards/ppl_reward/std": 2.4257378578186035, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 274.265625, "completions/mean_terminated_length": 274.265625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.674992385013707, "grad_norm": 1.1195523738861084, "kl": 0.6650390625, "learning_rate": 1.9925979402773905e-05, "loss": 0.0263, "num_tokens": 10702432.0, "reward": -2.3564453125, "reward_std": 0.19869846105575562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.212890625, "rewards/ppl_reward/std": 2.1091196537017822, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 333.546875, "completions/mean_terminated_length": 322.5873107910156, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6762107828205909, "grad_norm": 0.9843093156814575, "kl": 0.640625, "learning_rate": 1.992494183143955e-05, "loss": 0.0421, "num_tokens": 10730691.0, "reward": -2.6842041015625, "reward_std": 0.2999275326728821, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.868408203125, "rewards/ppl_reward/std": 2.521846055984497, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 327.0625, "completions/mean_terminated_length": 292.786865234375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.6774291806274749, "grad_norm": 1.0955085754394531, "kl": 0.599609375, "learning_rate": 1.992389706603027e-05, "loss": 0.1657, "num_tokens": 10758823.0, "reward": -4.0113525390625, "reward_std": 0.5288676023483276, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.522705078125, "rewards/ppl_reward/std": 4.8804192543029785, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 254.71875, "completions/mean_terminated_length": 254.71875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6786475784343589, "grad_norm": 1.2323397397994995, "kl": 0.7568359375, "learning_rate": 1.992284510730335e-05, "loss": -0.1267, "num_tokens": 10782677.0, "reward": -2.0670166015625, "reward_std": 0.16358834505081177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.634033203125, "rewards/ppl_reward/std": 1.7090559005737305, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6798659762412428, "grad_norm": 0.9277066588401794, "kl": 0.7255859375, "learning_rate": 1.992178595602131e-05, "loss": -0.0632, "num_tokens": 10808269.0, "reward": -2.9942626953125, "reward_std": 0.24482864141464233, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.488525390625, "rewards/ppl_reward/std": 1.3516725301742554, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 282.078125, "completions/mean_terminated_length": 282.078125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6810843740481267, "grad_norm": 1.0883982181549072, "kl": 0.70703125, "learning_rate": 1.9920719612951868e-05, "loss": 0.0835, "num_tokens": 10833850.0, "reward": -2.58056640625, "reward_std": 0.2461843341588974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.6611328125, "rewards/ppl_reward/std": 2.8031837940216064, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 294.359375, "completions/mean_terminated_length": 282.7778015136719, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6823027718550106, "grad_norm": 1.3823381662368774, "kl": 0.794921875, "learning_rate": 1.991964607886796e-05, "loss": 0.1321, "num_tokens": 10861361.0, "reward": -3.2900390625, "reward_std": 0.46650680899620056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.080078125, "rewards/ppl_reward/std": 7.210047721862793, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 290.578125, "completions/mean_terminated_length": 290.578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.6835211696618946, "grad_norm": 0.983408510684967, "kl": 0.72265625, "learning_rate": 1.9918565354547738e-05, "loss": -0.0319, "num_tokens": 10886350.0, "reward": -2.709716796875, "reward_std": 0.34003525972366333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.91943359375, "rewards/ppl_reward/std": 5.080273151397705, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 236.515625, "completions/mean_terminated_length": 236.515625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.6847395674687785, "grad_norm": 1.7730973958969116, "kl": 1.794921875, "learning_rate": 1.9917477440774564e-05, "loss": 0.1045, "num_tokens": 10909399.0, "reward": -2.23046875, "reward_std": 0.13891994953155518, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.96875, "rewards/ppl_reward/std": 2.2820041179656982, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.03125, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 304.78125, "completions/mean_terminated_length": 293.3650817871094, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6859579652756626, "grad_norm": 2.871877908706665, "kl": 2.7578125, "learning_rate": 1.991638233833701e-05, "loss": 0.1484, "num_tokens": 10936585.0, "reward": -4.1597900390625, "reward_std": 0.7711964845657349, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.835205078125, "rewards/ppl_reward/std": 5.427613258361816, "rewards/tag_count_reward/mean": 0.2578125, "rewards/tag_count_reward/std": 0.0625, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 296.3125, "completions/mean_terminated_length": 296.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6871763630825465, "grad_norm": 3.620156764984131, "kl": 1.62109375, "learning_rate": 1.9915280048028853e-05, "loss": 0.0694, "num_tokens": 10962925.0, "reward": -3.304931640625, "reward_std": 0.3792618215084076, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.19580078125, "rewards/ppl_reward/std": 3.9204583168029785, "rewards/tag_count_reward/mean": 0.29296875, "rewards/tag_count_reward/std": 0.09506355226039886, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 287.578125, "completions/mean_terminated_length": 275.888916015625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6883947608894304, "grad_norm": 1.0928127765655518, "kl": 1.07421875, "learning_rate": 1.9914170570649093e-05, "loss": 0.0862, "num_tokens": 10988098.0, "reward": -3.331787109375, "reward_std": 1.2042783498764038, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.28076171875, "rewards/ppl_reward/std": 7.7428812980651855, "rewards/tag_count_reward/mean": 0.30859375, "rewards/tag_count_reward/std": 0.11566052585840225, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 341.765625, "completions/mean_terminated_length": 341.765625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6896131586963143, "grad_norm": 1.0110557079315186, "kl": 0.8876953125, "learning_rate": 1.991305390700193e-05, "loss": 0.0393, "num_tokens": 11016667.0, "reward": -3.50958251953125, "reward_std": 0.8349400162696838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.8629150390625, "rewards/ppl_reward/std": 5.574408531188965, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 275.96875, "completions/mean_terminated_length": 275.96875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.6908315565031983, "grad_norm": 0.9615401029586792, "kl": 0.7822265625, "learning_rate": 1.9911930057896776e-05, "loss": -0.015, "num_tokens": 11040945.0, "reward": -3.4517822265625, "reward_std": 0.5582495331764221, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.747314453125, "rewards/ppl_reward/std": 3.516965389251709, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 387.296875, "completions/mean_terminated_length": 366.758056640625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6920499543100822, "grad_norm": 1.0141252279281616, "kl": 0.845703125, "learning_rate": 1.991079902414825e-05, "loss": -0.096, "num_tokens": 11072500.0, "reward": -3.51043701171875, "reward_std": 0.7793567180633545, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.3489990234375, "rewards/ppl_reward/std": 5.358035087585449, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.27537402510643005, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 504.796875, "completions/mean_terminated_length": 470.183349609375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6932683521169661, "grad_norm": 0.6326061487197876, "kl": 0.611328125, "learning_rate": 1.9909660806576178e-05, "loss": 0.1235, "num_tokens": 11111791.0, "reward": -3.17822265625, "reward_std": 0.7190442085266113, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.0205078125, "rewards/ppl_reward/std": 5.523508071899414, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.2958616316318512, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 584.171875, "completions/mean_terminated_length": 502.72222900390625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6944867499238502, "grad_norm": 0.6169783473014832, "kl": 0.642578125, "learning_rate": 1.9908515406005597e-05, "loss": 0.0236, "num_tokens": 11157050.0, "reward": -2.18603515625, "reward_std": 0.6445656418800354, "rewards/format_reward/mean": 0.265625, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -6.4892578125, "rewards/ppl_reward/std": 4.1643595695495605, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.27297118306159973, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 659.140625, "completions/mean_terminated_length": 547.448974609375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6957051477307341, "grad_norm": 0.6386117935180664, "kl": 0.57421875, "learning_rate": 1.9907362823266752e-05, "loss": 0.0116, "num_tokens": 11206731.0, "reward": -1.9222412109375, "reward_std": 0.4537947177886963, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.024169921875, "rewards/ppl_reward/std": 3.3142917156219482, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.2287265807390213, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 629.125, "completions/mean_terminated_length": 528.4705810546875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.696923545537618, "grad_norm": 0.6294379234313965, "kl": 0.5419921875, "learning_rate": 1.990620305919508e-05, "loss": 0.0395, "num_tokens": 11253723.0, "reward": -1.78240966796875, "reward_std": 1.425115942955017, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "rewards/ppl_reward/mean": -5.9866943359375, "rewards/ppl_reward/std": 6.5848283767700195, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2291666716337204, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 604.4375, "completions/mean_terminated_length": 497.4902038574219, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.698141943344502, "grad_norm": 0.6459609866142273, "kl": 0.5859375, "learning_rate": 1.9905036114631247e-05, "loss": 0.0927, "num_tokens": 11299175.0, "reward": -1.371337890625, "reward_std": 0.6855218410491943, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "rewards/ppl_reward/mean": -5.50048828125, "rewards/ppl_reward/std": 3.2730114459991455, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.22545011341571808, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 542.03125, "completions/mean_terminated_length": 501.1864318847656, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6993603411513859, "grad_norm": 0.6886385679244995, "kl": 0.587890625, "learning_rate": 1.99038619904211e-05, "loss": 0.1136, "num_tokens": 11340673.0, "reward": -1.3817138671875, "reward_std": 1.1968907117843628, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -5.849365234375, "rewards/ppl_reward/std": 5.179239749908447, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.20859359204769135, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 440.59375, "completions/mean_terminated_length": 391.1525573730469, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.7005787389582698, "grad_norm": 0.8125856518745422, "kl": 0.5986328125, "learning_rate": 1.9902680687415704e-05, "loss": 0.1743, "num_tokens": 11376527.0, "reward": -1.83544921875, "reward_std": 0.6897210478782654, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -6.7880859375, "rewards/ppl_reward/std": 3.2698023319244385, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20710203051567078, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 332.140625, "completions/mean_terminated_length": 332.140625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7017971367651539, "grad_norm": 0.8970351219177246, "kl": 0.69921875, "learning_rate": 1.9901492206471325e-05, "loss": 0.0159, "num_tokens": 11405096.0, "reward": -2.0914306640625, "reward_std": 1.1900802850723267, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -7.487548828125, "rewards/ppl_reward/std": 5.372919082641602, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20225508511066437, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 279.71875, "completions/mean_terminated_length": 279.71875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.7030155345720378, "grad_norm": 0.9341403841972351, "kl": 0.7939453125, "learning_rate": 1.990029654844943e-05, "loss": 0.0446, "num_tokens": 11430142.0, "reward": -2.533447265625, "reward_std": 1.0576481819152832, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -8.17626953125, "rewards/ppl_reward/std": 5.019115447998047, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.21807578206062317, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 237.796875, "completions/mean_terminated_length": 237.796875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7042339323789217, "grad_norm": 3.4573848247528076, "kl": 0.9345703125, "learning_rate": 1.9899093714216695e-05, "loss": 0.0314, "num_tokens": 11452137.0, "reward": -6.75341796875, "reward_std": 1.5235927104949951, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -16.2646484375, "rewards/ppl_reward/std": 21.209333419799805, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.26065465807914734, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7054523301858057, "grad_norm": 2.1950223445892334, "kl": 1.3671875, "learning_rate": 1.9897883704644982e-05, "loss": -0.1564, "num_tokens": 11474097.0, "reward": -16.26806640625, "reward_std": 0.8965762257575989, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -32.9580078125, "rewards/ppl_reward/std": 25.01317596435547, "rewards/tag_count_reward/mean": 0.2109375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 270.359375, "completions/mean_terminated_length": 270.359375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7066707279926896, "grad_norm": 1.924148678779602, "kl": 1.10546875, "learning_rate": 1.9896666520611375e-05, "loss": -0.1988, "num_tokens": 11497968.0, "reward": -18.023193359375, "reward_std": 0.2731763422489166, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -36.47607421875, "rewards/ppl_reward/std": 50.80846405029297, "rewards/tag_count_reward/mean": 0.21484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 355.296875, "completions/mean_terminated_length": 355.296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7078891257995735, "grad_norm": 1.420951247215271, "kl": 0.994140625, "learning_rate": 1.9895442162998136e-05, "loss": 0.008, "num_tokens": 11527851.0, "reward": -4.548583984375, "reward_std": 1.0965310335159302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.74560546875, "rewards/ppl_reward/std": 4.2434258460998535, "rewards/tag_count_reward/mean": 0.32421875, "rewards/tag_count_reward/std": 0.23435020446777344, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 496.546875, "completions/mean_terminated_length": 461.38336181640625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7091075236064575, "grad_norm": 1.0430412292480469, "kl": 0.8427734375, "learning_rate": 1.9894210632692745e-05, "loss": -0.0583, "num_tokens": 11567398.0, "reward": -5.9923095703125, "reward_std": 2.577831268310547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.804931640625, "rewards/ppl_reward/std": 17.12676239013672, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.25341787934303284, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 604.453125, "completions/mean_terminated_length": 464.60418701171875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7103259214133415, "grad_norm": 0.6204935908317566, "kl": 0.66015625, "learning_rate": 1.9892971930587873e-05, "loss": -0.1055, "num_tokens": 11612523.0, "reward": -25.51361083984375, "reward_std": 23.643627166748047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -51.5194091796875, "rewards/ppl_reward/std": 132.74102783203125, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.26908203959465027, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 694.140625, "completions/mean_terminated_length": 437.5833435058594, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7115443192202254, "grad_norm": 0.5929585099220276, "kl": 0.615234375, "learning_rate": 1.9891726057581394e-05, "loss": -0.2188, "num_tokens": 11663020.0, "reward": -21.7869873046875, "reward_std": 15.120370864868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -43.917724609375, "rewards/ppl_reward/std": 114.28770446777344, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.27048972249031067, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 911.4375, "completions/mean_terminated_length": 423.66668701171875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7127627170271094, "grad_norm": 0.5194575786590576, "kl": 0.50732421875, "learning_rate": 1.9890473014576375e-05, "loss": -0.091, "num_tokens": 11729912.0, "reward": -3.2047119140625, "reward_std": 0.5841090679168701, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.768798828125, "rewards/ppl_reward/std": 2.8564579486846924, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.26899561285972595, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 1015.8125, "completions/mean_terminated_length": 500.0, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.7139811148339933, "grad_norm": 0.47922173142433167, "kl": 0.50146484375, "learning_rate": 1.9889212802481076e-05, "loss": 0.0048, "num_tokens": 11801812.0, "reward": -5.6849365234375, "reward_std": 1.5560126304626465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.526123046875, "rewards/ppl_reward/std": 8.045451164245605, "rewards/tag_count_reward/mean": 0.078125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7151995126408772, "grad_norm": 0.5177332758903503, "kl": 0.51123046875, "learning_rate": 1.9887945422208966e-05, "loss": 0.0204, "num_tokens": 11874012.0, "reward": -6.704833984375, "reward_std": 2.2849936485290527, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -13.67529296875, "rewards/ppl_reward/std": 16.024246215820312, "rewards/tag_count_reward/mean": 0.1328125, "rewards/tag_count_reward/std": 0.21807578206062317, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 1016.8125, "completions/mean_terminated_length": 564.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.7164179104477612, "grad_norm": 0.47401943802833557, "kl": 0.46533203125, "learning_rate": 1.9886670874678705e-05, "loss": 0.0199, "num_tokens": 11945552.0, "reward": -12.06494140625, "reward_std": 11.042444229125977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -24.3173828125, "rewards/ppl_reward/std": 67.16102600097656, "rewards/tag_count_reward/mean": 0.09375, "rewards/tag_count_reward/std": 0.18633900582790375, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7176363082546452, "grad_norm": 0.4895690977573395, "kl": 0.47412109375, "learning_rate": 1.9885389160814136e-05, "loss": 0.019, "num_tokens": 12017280.0, "reward": -4.5682373046875, "reward_std": 1.375931978225708, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.284912109375, "rewards/ppl_reward/std": 5.435562610626221, "rewards/tag_count_reward/mean": 0.07421875, "rewards/tag_count_reward/std": 0.19760315120220184, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7188547060615291, "grad_norm": 0.5042840838432312, "kl": 0.44921875, "learning_rate": 1.9884100281544317e-05, "loss": 0.018, "num_tokens": 12090064.0, "reward": -4.7313232421875, "reward_std": 2.266697645187378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.712646484375, "rewards/ppl_reward/std": 13.127964973449707, "rewards/tag_count_reward/mean": 0.125, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7200731038684131, "grad_norm": 0.4530857503414154, "kl": 0.4287109375, "learning_rate": 1.9882804237803487e-05, "loss": 0.0171, "num_tokens": 12162520.0, "reward": -3.689453125, "reward_std": 1.2992092370986938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.74609375, "rewards/ppl_reward/std": 4.443774700164795, "rewards/tag_count_reward/mean": 0.18359375, "rewards/tag_count_reward/std": 0.2754584550857544, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.721291501675297, "grad_norm": 0.4953289330005646, "kl": 0.4580078125, "learning_rate": 1.9881501030531078e-05, "loss": 0.0183, "num_tokens": 12234416.0, "reward": -5.3997802734375, "reward_std": 2.306555986404419, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.143310546875, "rewards/ppl_reward/std": 15.838173866271973, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7225098994821809, "grad_norm": 0.4694961607456207, "kl": 0.4345703125, "learning_rate": 1.988019066067172e-05, "loss": 0.0174, "num_tokens": 12307104.0, "reward": -2.56787109375, "reward_std": 0.9187731146812439, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.4951171875, "rewards/ppl_reward/std": 3.0773425102233887, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.2538151443004608, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7237282972890648, "grad_norm": 0.5022544860839844, "kl": 0.4150390625, "learning_rate": 1.9878873129175228e-05, "loss": 0.0166, "num_tokens": 12380488.0, "reward": -3.72509765625, "reward_std": 1.1129310131072998, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.8408203125, "rewards/ppl_reward/std": 5.472503185272217, "rewards/tag_count_reward/mean": 0.1953125, "rewards/tag_count_reward/std": 0.24587368965148926, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 1023.46875, "completions/mean_terminated_length": 990.0, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "epoch": 0.7249466950959488, "grad_norm": 0.57322758436203, "kl": 0.484375, "learning_rate": 1.9877548436996622e-05, "loss": 0.0195, "num_tokens": 12452542.0, "reward": -5.6279296875, "reward_std": 1.8592138290405273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.583984375, "rewards/ppl_reward/std": 9.20559024810791, "rewards/tag_count_reward/mean": 0.1640625, "rewards/tag_count_reward/std": 0.2280818521976471, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7261650929028328, "grad_norm": 0.529068112373352, "kl": 0.44873046875, "learning_rate": 1.9876216585096087e-05, "loss": 0.0179, "num_tokens": 12525230.0, "reward": -4.42730712890625, "reward_std": 1.2076530456542969, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.2374267578125, "rewards/ppl_reward/std": 5.282032012939453, "rewards/tag_count_reward/mean": 0.19140625, "rewards/tag_count_reward/std": 0.23880568146705627, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 1015.234375, "completions/mean_terminated_length": 463.0, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.7273834907097168, "grad_norm": 0.5328705310821533, "kl": 0.49072265625, "learning_rate": 1.9874877574439028e-05, "loss": 0.0012, "num_tokens": 12597389.0, "reward": -4.312255859375, "reward_std": 1.4302916526794434, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.00732421875, "rewards/ppl_reward/std": 8.223088264465332, "rewards/tag_count_reward/mean": 0.19140625, "rewards/tag_count_reward/std": 0.25874462723731995, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7286018885166007, "grad_norm": 0.5225391983985901, "kl": 0.4677734375, "learning_rate": 1.9873531405996017e-05, "loss": 0.0187, "num_tokens": 12670197.0, "reward": -3.700927734375, "reward_std": 0.8865249752998352, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.92529296875, "rewards/ppl_reward/std": 4.520557880401611, "rewards/tag_count_reward/mean": 0.26171875, "rewards/tag_count_reward/std": 0.26885148882865906, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7298202863234846, "grad_norm": 0.5109557509422302, "kl": 0.4697265625, "learning_rate": 1.9872178080742822e-05, "loss": 0.0188, "num_tokens": 12742853.0, "reward": -3.504150390625, "reward_std": 0.9090569019317627, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.48486328125, "rewards/ppl_reward/std": 3.9304990768432617, "rewards/tag_count_reward/mean": 0.23828125, "rewards/tag_count_reward/std": 0.22016267478466034, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 1017.328125, "completions/mean_terminated_length": 810.5, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.7310386841303685, "grad_norm": 0.8098949790000916, "kl": 0.5234375, "learning_rate": 1.98708175996604e-05, "loss": 0.0234, "num_tokens": 12815210.0, "reward": -3.79345703125, "reward_std": 1.162414312362671, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.1025390625, "rewards/ppl_reward/std": 3.9136555194854736, "rewards/tag_count_reward/mean": 0.2578125, "rewards/tag_count_reward/std": 0.24384792149066925, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 1024.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 974.390625, "completions/mean_terminated_length": 494.8333435058594, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.7322570819372525, "grad_norm": 0.554299533367157, "kl": 0.50146484375, "learning_rate": 1.9869449963734894e-05, "loss": 0.0201, "num_tokens": 12884651.0, "reward": -3.553955078125, "reward_std": 1.4383723735809326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.69384765625, "rewards/ppl_reward/std": 7.046789646148682, "rewards/tag_count_reward/mean": 0.29296875, "rewards/tag_count_reward/std": 0.22953839600086212, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 935.421875, "completions/mean_terminated_length": 619.0714721679688, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.7334754797441365, "grad_norm": 0.5285274386405945, "kl": 0.5166015625, "learning_rate": 1.9868075173957632e-05, "loss": -0.0286, "num_tokens": 12951734.0, "reward": -5.74932861328125, "reward_std": 1.4486827850341797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -12.0689697265625, "rewards/ppl_reward/std": 12.789775848388672, "rewards/tag_count_reward/mean": 0.28515625, "rewards/tag_count_reward/std": 0.23088505864143372, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 811.890625, "completions/mean_terminated_length": 481.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7346938775510204, "grad_norm": 0.5185193419456482, "kl": 0.5244140625, "learning_rate": 1.986669323132512e-05, "loss": -0.0833, "num_tokens": 13010391.0, "reward": -3.5419921875, "reward_std": 2.1396889686584473, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.708984375, "rewards/ppl_reward/std": 7.798243045806885, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.2182178944349289, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 703.421875, "completions/mean_terminated_length": 454.0833435058594, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.7359122753579044, "grad_norm": 0.6380831599235535, "kl": 0.576171875, "learning_rate": 1.9865304136839066e-05, "loss": -0.2009, "num_tokens": 13062394.0, "reward": -5.571044921875, "reward_std": 2.311316967010498, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -11.78271484375, "rewards/ppl_reward/std": 16.783742904663086, "rewards/tag_count_reward/mean": 0.3203125, "rewards/tag_count_reward/std": 0.23345555365085602, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 653.875, "completions/mean_terminated_length": 366.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7371306731647883, "grad_norm": 0.6573635935783386, "kl": 0.552734375, "learning_rate": 1.9863907891506348e-05, "loss": -0.1842, "num_tokens": 13111218.0, "reward": -4.576171875, "reward_std": 1.5337538719177246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.66015625, "rewards/ppl_reward/std": 4.7189555168151855, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.18629740178585052, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 560.9375, "completions/mean_terminated_length": 334.79071044921875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7383490709716722, "grad_norm": 0.7630788683891296, "kl": 0.599609375, "learning_rate": 1.9862504496339036e-05, "loss": -0.3007, "num_tokens": 13153886.0, "reward": -5.2147216796875, "reward_std": 2.083787441253662, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.999755859375, "rewards/ppl_reward/std": 7.765792369842529, "rewards/tag_count_reward/mean": 0.28515625, "rewards/tag_count_reward/std": 0.1882837563753128, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 671.703125, "completions/mean_terminated_length": 360.8529357910156, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7395674687785562, "grad_norm": 0.5553275346755981, "kl": 0.537109375, "learning_rate": 1.9861093952354372e-05, "loss": -0.2016, "num_tokens": 13203475.0, "reward": -8.3385009765625, "reward_std": 3.1268296241760254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -17.145751953125, "rewards/ppl_reward/std": 22.16503143310547, "rewards/tag_count_reward/mean": 0.234375, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 803.84375, "completions/mean_terminated_length": 383.54547119140625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7407858665854401, "grad_norm": 0.5105798244476318, "kl": 0.48046875, "learning_rate": 1.9859676260574792e-05, "loss": -0.0417, "num_tokens": 13261873.0, "reward": -4.48828125, "reward_std": 1.3504953384399414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.4140625, "rewards/ppl_reward/std": 7.270249843597412, "rewards/tag_count_reward/mean": 0.21875, "rewards/tag_count_reward/std": 0.22047927975654602, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 924.140625, "completions/mean_terminated_length": 648.058837890625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.7420042643923241, "grad_norm": 0.5686663389205933, "kl": 0.47802734375, "learning_rate": 1.9858251422027903e-05, "loss": 0.0192, "num_tokens": 13327642.0, "reward": -3.55322265625, "reward_std": 0.6896792650222778, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.4970703125, "rewards/ppl_reward/std": 3.2810006141662598, "rewards/tag_count_reward/mean": 0.1953125, "rewards/tag_count_reward/std": 0.2291666716337204, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 860.734375, "completions/mean_terminated_length": 663.6896362304688, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7432226621992081, "grad_norm": 0.5547820925712585, "kl": 0.49560546875, "learning_rate": 1.9856819437746496e-05, "loss": -0.0139, "num_tokens": 13389329.0, "reward": -7.7197265625, "reward_std": 1.5389583110809326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -15.783203125, "rewards/ppl_reward/std": 11.438165664672852, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 697.453125, "completions/mean_terminated_length": 549.0227661132812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.744441060006092, "grad_norm": 0.4766968786716461, "kl": 0.43408203125, "learning_rate": 1.9855380308768546e-05, "loss": -0.0753, "num_tokens": 13441238.0, "reward": -3.1387939453125, "reward_std": 0.8090713024139404, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.879150390625, "rewards/ppl_reward/std": 2.412499189376831, "rewards/tag_count_reward/mean": 0.30078125, "rewards/tag_count_reward/std": 0.2679274082183838, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 463.96875, "completions/mean_terminated_length": 360.2592468261719, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7456594578129759, "grad_norm": 0.7151383757591248, "kl": 0.546875, "learning_rate": 1.98539340361372e-05, "loss": -0.037, "num_tokens": 13477668.0, "reward": -4.1136474609375, "reward_std": 1.311807632446289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.883544921875, "rewards/ppl_reward/std": 5.4152398109436035, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.2592533528804779, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 424.921875, "completions/mean_terminated_length": 384.9833679199219, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.7468778556198599, "grad_norm": 0.6760555505752563, "kl": 0.5107421875, "learning_rate": 1.985248062090079e-05, "loss": -0.0481, "num_tokens": 13511351.0, "reward": -9.322998046875, "reward_std": 4.069204330444336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -19.38818359375, "rewards/ppl_reward/std": 25.61249351501465, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.27091917395591736, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 344.734375, "completions/mean_terminated_length": 344.734375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7480962534267438, "grad_norm": 0.7740457057952881, "kl": 0.498046875, "learning_rate": 1.9851020064112813e-05, "loss": -0.186, "num_tokens": 13540334.0, "reward": -3.1982421875, "reward_std": 1.366987705230713, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.029296875, "rewards/ppl_reward/std": 4.314981937408447, "rewards/tag_count_reward/mean": 0.31640625, "rewards/tag_count_reward/std": 0.30616089701652527, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7493146512336277, "grad_norm": 0.8301798105239868, "kl": 0.48291015625, "learning_rate": 1.984955236683196e-05, "loss": -0.0829, "num_tokens": 13569038.0, "reward": -5.051513671875, "reward_std": 2.29179048538208, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.96240234375, "rewards/ppl_reward/std": 10.044400215148926, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.26528194546699524, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 303.453125, "completions/mean_terminated_length": 303.453125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7505330490405118, "grad_norm": 0.8497047424316406, "kl": 0.5458984375, "learning_rate": 1.9848077530122083e-05, "loss": -0.0153, "num_tokens": 13595059.0, "reward": -6.681396484375, "reward_std": 4.16239070892334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -14.40185546875, "rewards/ppl_reward/std": 24.96535301208496, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.3708224594593048, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 299.953125, "completions/mean_terminated_length": 299.953125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.7517514468473957, "grad_norm": 0.9789483547210693, "kl": 0.50927734375, "learning_rate": 1.9846595555052214e-05, "loss": 0.0758, "num_tokens": 13621096.0, "reward": -4.7142333984375, "reward_std": 1.6337147951126099, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.623779296875, "rewards/ppl_reward/std": 8.558329582214355, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3008492887020111, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.7529698446542796, "grad_norm": 0.797239363193512, "kl": 0.53662109375, "learning_rate": 1.9845106442696563e-05, "loss": -0.0441, "num_tokens": 13647584.0, "reward": -9.1976318359375, "reward_std": 6.205406665802002, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -19.403076171875, "rewards/ppl_reward/std": 27.139991760253906, "rewards/tag_count_reward/mean": 0.50390625, "rewards/tag_count_reward/std": 0.3435695767402649, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 316.265625, "completions/mean_terminated_length": 316.265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7541882424611636, "grad_norm": 0.8737626671791077, "kl": 0.509765625, "learning_rate": 1.984361019413451e-05, "loss": 0.0297, "num_tokens": 13675313.0, "reward": -3.6070556640625, "reward_std": 1.3398886919021606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.409423828125, "rewards/ppl_reward/std": 7.695198059082031, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.30737364292144775, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 294.109375, "completions/mean_terminated_length": 294.109375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7554066402680475, "grad_norm": 1.0370687246322632, "kl": 0.55419921875, "learning_rate": 1.9842106810450605e-05, "loss": 0.1322, "num_tokens": 13701168.0, "reward": -2.553955078125, "reward_std": 0.7130699157714844, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.30322265625, "rewards/ppl_reward/std": 3.2123069763183594, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3291921019554138, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 219.421875, "completions/mean_terminated_length": 219.421875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7566250380749314, "grad_norm": 1.0845682621002197, "kl": 0.52734375, "learning_rate": 1.9840596292734573e-05, "loss": -0.1858, "num_tokens": 13722419.0, "reward": -4.0394287109375, "reward_std": 1.4597573280334473, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.469482421875, "rewards/ppl_reward/std": 5.370580673217773, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.3101099133491516, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 212.859375, "completions/mean_terminated_length": 212.859375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7578434358818155, "grad_norm": 0.9862907528877258, "kl": 0.599609375, "learning_rate": 1.9839078642081312e-05, "loss": -0.0715, "num_tokens": 13742354.0, "reward": -3.45751953125, "reward_std": 2.13216495513916, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.5244140625, "rewards/ppl_reward/std": 7.7721710205078125, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2798410654067993, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7590618336886994, "grad_norm": 1.1111695766448975, "kl": 0.5068359375, "learning_rate": 1.9837553859590888e-05, "loss": 0.0032, "num_tokens": 13767066.0, "reward": -2.959716796875, "reward_std": 0.6907594799995422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.65380859375, "rewards/ppl_reward/std": 4.0609450340271, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.25185325741767883, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 273.140625, "completions/mean_terminated_length": 273.140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7602802314955833, "grad_norm": 1.0470739603042603, "kl": 0.5, "learning_rate": 1.9836021946368538e-05, "loss": 0.0625, "num_tokens": 13791899.0, "reward": -3.356689453125, "reward_std": 0.8937036991119385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.47119140625, "rewards/ppl_reward/std": 8.292607307434082, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2781464755535126, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 278.46875, "completions/mean_terminated_length": 278.46875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7614986293024673, "grad_norm": 1.043483853340149, "kl": 0.5283203125, "learning_rate": 1.9834482903524663e-05, "loss": 0.063, "num_tokens": 13816857.0, "reward": -1.7589111328125, "reward_std": 0.2123696357011795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.431884765625, "rewards/ppl_reward/std": 2.6352083683013916, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 261.28125, "completions/mean_terminated_length": 261.28125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.7627170271093512, "grad_norm": 1.1178842782974243, "kl": 0.51953125, "learning_rate": 1.9832936732174835e-05, "loss": 0.0197, "num_tokens": 13840139.0, "reward": -1.571044921875, "reward_std": 0.40491923689842224, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.04052734375, "rewards/ppl_reward/std": 1.9970365762710571, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 277.09375, "completions/mean_terminated_length": 277.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7639354249162351, "grad_norm": 1.0909804105758667, "kl": 0.603515625, "learning_rate": 1.9831383433439798e-05, "loss": -0.0535, "num_tokens": 13864889.0, "reward": -9.450927734375, "reward_std": 11.623905181884766, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -20.76904296875, "rewards/ppl_reward/std": 62.436187744140625, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 318.140625, "completions/mean_terminated_length": 318.140625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.765153822723119, "grad_norm": 1.099402666091919, "kl": 0.5537109375, "learning_rate": 1.982982300844545e-05, "loss": 0.027, "num_tokens": 13892186.0, "reward": -1.89013671875, "reward_std": 0.5846515893936157, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.3193359375, "rewards/ppl_reward/std": 3.5228323936462402, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.26908203959465027, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 247.3125, "completions/mean_terminated_length": 247.3125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7663722205300031, "grad_norm": 0.9433051943778992, "kl": 0.57421875, "learning_rate": 1.982825545832287e-05, "loss": -0.2699, "num_tokens": 13914638.0, "reward": -4.552734375, "reward_std": 2.7130818367004395, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.69140625, "rewards/ppl_reward/std": 11.191428184509277, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.35788923501968384, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 291.609375, "completions/mean_terminated_length": 291.609375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.767590618336887, "grad_norm": 0.9559817910194397, "kl": 0.58203125, "learning_rate": 1.9826680784208293e-05, "loss": -0.1724, "num_tokens": 13940045.0, "reward": -3.5687255859375, "reward_std": 1.9771555662155151, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.864013671875, "rewards/ppl_reward/std": 5.716294288635254, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.2907884418964386, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 313.28125, "completions/mean_terminated_length": 313.28125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.768809016143771, "grad_norm": 0.9314897060394287, "kl": 0.55078125, "learning_rate": 1.982509898724311e-05, "loss": -0.1328, "num_tokens": 13967135.0, "reward": -4.3653564453125, "reward_std": 0.9705976843833923, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.363525390625, "rewards/ppl_reward/std": 12.792856216430664, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.31573230028152466, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 377.15625, "completions/mean_terminated_length": 356.2903137207031, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7700274139506549, "grad_norm": 0.9278242588043213, "kl": 0.55615234375, "learning_rate": 1.9823510068573895e-05, "loss": 0.0781, "num_tokens": 13998289.0, "reward": -1.7269287109375, "reward_std": 0.2982850968837738, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.313232421875, "rewards/ppl_reward/std": 2.519573211669922, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 408.421875, "completions/mean_terminated_length": 408.421875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7712458117575388, "grad_norm": 0.8101586699485779, "kl": 0.53173828125, "learning_rate": 1.9821914029352364e-05, "loss": -0.0153, "num_tokens": 14031268.0, "reward": -2.48931884765625, "reward_std": 0.7318626642227173, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.8302001953125, "rewards/ppl_reward/std": 3.85683536529541, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.13858474791049957, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 380.640625, "completions/mean_terminated_length": 380.640625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7724642095644227, "grad_norm": 0.9323115944862366, "kl": 0.537109375, "learning_rate": 1.9820310870735404e-05, "loss": 0.0341, "num_tokens": 14062685.0, "reward": -3.434326171875, "reward_std": 0.6841224431991577, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.83740234375, "rewards/ppl_reward/std": 5.336989879608154, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 394.9375, "completions/mean_terminated_length": 394.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.7736826073713068, "grad_norm": 0.8510453701019287, "kl": 0.55419921875, "learning_rate": 1.9818700593885066e-05, "loss": 0.0265, "num_tokens": 14094289.0, "reward": -1.3372802734375, "reward_std": 0.3501349985599518, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.729248046875, "rewards/ppl_reward/std": 1.8285417556762695, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 445.484375, "completions/mean_terminated_length": 445.484375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7749010051781907, "grad_norm": 0.8890380859375, "kl": 0.521484375, "learning_rate": 1.9817083199968552e-05, "loss": -0.0191, "num_tokens": 14129704.0, "reward": -2.25927734375, "reward_std": 0.3044129014015198, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.4794921875, "rewards/ppl_reward/std": 3.143127918243408, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 452.515625, "completions/mean_terminated_length": 443.4444580078125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7761194029850746, "grad_norm": 0.8716006278991699, "kl": 0.50927734375, "learning_rate": 1.9815458690158226e-05, "loss": 0.0317, "num_tokens": 14165153.0, "reward": -2.4483642578125, "reward_std": 1.6992106437683105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.873291015625, "rewards/ppl_reward/std": 9.192977905273438, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 505.625, "completions/mean_terminated_length": 505.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.7773378007919586, "grad_norm": 0.8756385445594788, "kl": 0.49853515625, "learning_rate": 1.9813827065631617e-05, "loss": 0.0215, "num_tokens": 14203641.0, "reward": -2.16015625, "reward_std": 0.37231582403182983, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.3359375, "rewards/ppl_reward/std": 2.9719698429107666, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 534.828125, "completions/mean_terminated_length": 527.0635375976562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7785561985988425, "grad_norm": 0.8076552152633667, "kl": 0.46435546875, "learning_rate": 1.98121883275714e-05, "loss": 0.0917, "num_tokens": 14245198.0, "reward": -2.45458984375, "reward_std": 0.39028847217559814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.8544921875, "rewards/ppl_reward/std": 3.59455943107605, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 709.21875, "completions/mean_terminated_length": 650.9259033203125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.7797745964057264, "grad_norm": 0.7756741642951965, "kl": 0.44189453125, "learning_rate": 1.981054247716541e-05, "loss": 0.1542, "num_tokens": 14297908.0, "reward": -3.3614501953125, "reward_std": 1.566745400428772, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.457275390625, "rewards/ppl_reward/std": 6.130821228027344, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.29536348581314087, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 644.703125, "completions/mean_terminated_length": 619.4166870117188, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7809929942126104, "grad_norm": 0.8561777472496033, "kl": 0.47265625, "learning_rate": 1.9808889515606644e-05, "loss": 0.0737, "num_tokens": 14346729.0, "reward": -4.25244140625, "reward_std": 1.0398238897323608, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -10.3720703125, "rewards/ppl_reward/std": 6.118724346160889, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 514.796875, "completions/mean_terminated_length": 506.7143249511719, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7822113920194944, "grad_norm": 0.7965400815010071, "kl": 0.47900390625, "learning_rate": 1.980722944409324e-05, "loss": -0.0204, "num_tokens": 14388276.0, "reward": -1.203857421875, "reward_std": 0.13911627233028412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.36865234375, "rewards/ppl_reward/std": 1.837433099746704, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 457.421875, "completions/mean_terminated_length": 448.4285888671875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.7834297898263783, "grad_norm": 0.8186072111129761, "kl": 0.50634765625, "learning_rate": 1.9805562263828505e-05, "loss": -0.0163, "num_tokens": 14424423.0, "reward": -2.338134765625, "reward_std": 0.28838884830474854, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.69970703125, "rewards/ppl_reward/std": 1.1912875175476074, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 432.546875, "completions/mean_terminated_length": 432.546875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7846481876332623, "grad_norm": 0.9490050077438354, "kl": 0.482421875, "learning_rate": 1.980388797602089e-05, "loss": 0.0027, "num_tokens": 14459354.0, "reward": -3.9169921875, "reward_std": 0.8083292245864868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -9.802734375, "rewards/ppl_reward/std": 7.656937599182129, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 382.1875, "completions/mean_terminated_length": 382.1875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7858665854401462, "grad_norm": 0.921780526638031, "kl": 0.50732421875, "learning_rate": 1.9802206581883992e-05, "loss": -0.0584, "num_tokens": 14490662.0, "reward": -2.24853515625, "reward_std": 0.445026159286499, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -6.4736328125, "rewards/ppl_reward/std": 5.081242084503174, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 377.390625, "completions/mean_terminated_length": 377.390625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7870849832470301, "grad_norm": 0.9873285293579102, "kl": 0.5302734375, "learning_rate": 1.980051808263658e-05, "loss": 0.042, "num_tokens": 14521607.0, "reward": -5.887939453125, "reward_std": 0.8872402906417847, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -13.79150390625, "rewards/ppl_reward/std": 10.592622756958008, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 397.171875, "completions/mean_terminated_length": 397.171875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7883033810539141, "grad_norm": 1.0123218297958374, "kl": 0.53466796875, "learning_rate": 1.9798822479502547e-05, "loss": 0.0679, "num_tokens": 14554466.0, "reward": -2.9088134765625, "reward_std": 0.4517940282821655, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.817626953125, "rewards/ppl_reward/std": 4.045974254608154, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 351.5625, "completions/mean_terminated_length": 351.5625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.789521778860798, "grad_norm": 0.9960734248161316, "kl": 0.5634765625, "learning_rate": 1.9797119773710952e-05, "loss": -0.0274, "num_tokens": 14583542.0, "reward": -3.419189453125, "reward_std": 0.7581765651702881, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.90087890625, "rewards/ppl_reward/std": 6.258288860321045, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 393.984375, "completions/mean_terminated_length": 393.984375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.790740176667682, "grad_norm": 1.0195372104644775, "kl": 0.52294921875, "learning_rate": 1.9795409966496e-05, "loss": -0.0369, "num_tokens": 14616325.0, "reward": -1.892333984375, "reward_std": 0.3276379108428955, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.78466796875, "rewards/ppl_reward/std": 2.1719284057617188, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 401.515625, "completions/mean_terminated_length": 401.515625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.791958574474566, "grad_norm": 0.9608635902404785, "kl": 0.533203125, "learning_rate": 1.9793693059097035e-05, "loss": 0.0064, "num_tokens": 14649742.0, "reward": -2.1746826171875, "reward_std": 0.31859293580055237, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.458740234375, "rewards/ppl_reward/std": 6.830774784088135, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 389.96875, "completions/mean_terminated_length": 389.96875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7931769722814499, "grad_norm": 1.0644972324371338, "kl": 0.5498046875, "learning_rate": 1.9791969052758563e-05, "loss": 0.0627, "num_tokens": 14681772.0, "reward": -4.17510986328125, "reward_std": 0.776065468788147, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.5689697265625, "rewards/ppl_reward/std": 10.262901306152344, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 340.890625, "completions/mean_terminated_length": 340.890625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.7943953700883338, "grad_norm": 1.0535589456558228, "kl": 0.5712890625, "learning_rate": 1.979023794873022e-05, "loss": -0.0531, "num_tokens": 14709989.0, "reward": -2.4053955078125, "reward_std": 0.48129594326019287, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.076416015625, "rewards/ppl_reward/std": 5.6655449867248535, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 343.3125, "completions/mean_terminated_length": 343.3125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7956137678952178, "grad_norm": 1.0316293239593506, "kl": 0.52685546875, "learning_rate": 1.9788499748266797e-05, "loss": -0.0002, "num_tokens": 14738585.0, "reward": -2.54638671875, "reward_std": 0.9201515913009644, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -8.1396484375, "rewards/ppl_reward/std": 4.920105934143066, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 317.015625, "completions/mean_terminated_length": 317.015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7968321657021017, "grad_norm": 1.0708513259887695, "kl": 0.5556640625, "learning_rate": 1.9786754452628226e-05, "loss": 0.0071, "num_tokens": 14765898.0, "reward": -1.29864501953125, "reward_std": 0.9905369281768799, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.9410400390625, "rewards/ppl_reward/std": 6.079434871673584, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 341.546875, "completions/mean_terminated_length": 341.546875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7980505635089857, "grad_norm": 1.0848841667175293, "kl": 0.544921875, "learning_rate": 1.9785002063079577e-05, "loss": 0.0042, "num_tokens": 14794349.0, "reward": -4.298095703125, "reward_std": 2.5364081859588623, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -11.97900390625, "rewards/ppl_reward/std": 15.409607887268066, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.09506355226039886, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 319.34375, "completions/mean_terminated_length": 319.34375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7992689613158697, "grad_norm": 1.3508855104446411, "kl": 0.5908203125, "learning_rate": 1.9783242580891067e-05, "loss": 0.0174, "num_tokens": 14821187.0, "reward": -3.763916015625, "reward_std": 0.88812655210495, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -10.90283203125, "rewards/ppl_reward/std": 6.900487422943115, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14773420989513397, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 332.625, "completions/mean_terminated_length": 332.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8004873591227536, "grad_norm": 1.0282152891159058, "kl": 0.591796875, "learning_rate": 1.9781476007338058e-05, "loss": 0.0505, "num_tokens": 14848923.0, "reward": -5.910888671875, "reward_std": 1.8670094013214111, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -15.39208984375, "rewards/ppl_reward/std": 18.471986770629883, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.104981929063797, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 306.421875, "completions/mean_terminated_length": 306.421875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.8017057569296375, "grad_norm": 1.5029844045639038, "kl": 0.6787109375, "learning_rate": 1.9779702343701045e-05, "loss": 0.1013, "num_tokens": 14876798.0, "reward": -1.214599609375, "reward_std": 0.48749569058418274, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.17138671875, "rewards/ppl_reward/std": 3.9855592250823975, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 256.765625, "completions/mean_terminated_length": 256.765625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8029241547365215, "grad_norm": 1.295427680015564, "kl": 0.8193359375, "learning_rate": 1.977792159126566e-05, "loss": 0.0056, "num_tokens": 14900095.0, "reward": -0.596435546875, "reward_std": 0.3789346218109131, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.00537109375, "rewards/ppl_reward/std": 2.385075330734253, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 220.390625, "completions/mean_terminated_length": 220.390625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8041425525434054, "grad_norm": 1.3908065557479858, "kl": 0.7861328125, "learning_rate": 1.9776133751322682e-05, "loss": -0.0227, "num_tokens": 14921208.0, "reward": -2.3641357421875, "reward_std": 0.9573438763618469, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.376708984375, "rewards/ppl_reward/std": 4.550209999084473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 216.09375, "completions/mean_terminated_length": 216.09375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8053609503502893, "grad_norm": 1.4303606748580933, "kl": 0.8046875, "learning_rate": 1.9774338825168024e-05, "loss": -0.0347, "num_tokens": 14942006.0, "reward": -2.1158447265625, "reward_std": 0.566325306892395, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.098876953125, "rewards/ppl_reward/std": 6.589327812194824, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 211.828125, "completions/mean_terminated_length": 211.828125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8065793481571734, "grad_norm": 1.570719599723816, "kl": 1.017578125, "learning_rate": 1.977253681410273e-05, "loss": -0.0191, "num_tokens": 14962947.0, "reward": -1.9080810546875, "reward_std": 0.6275389790534973, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.573974609375, "rewards/ppl_reward/std": 5.141388893127441, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 206.46875, "completions/mean_terminated_length": 206.46875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.8077977459640573, "grad_norm": 2.5206570625305176, "kl": 1.001953125, "learning_rate": 1.9770727719432994e-05, "loss": 0.0338, "num_tokens": 14982425.0, "reward": -1.53759765625, "reward_std": 0.5092467665672302, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -7.0126953125, "rewards/ppl_reward/std": 4.782896041870117, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 171.71875, "completions/mean_terminated_length": 171.71875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.8090161437709412, "grad_norm": 1.7089922428131104, "kl": 1.0205078125, "learning_rate": 1.9768911542470125e-05, "loss": -0.0172, "num_tokens": 15000183.0, "reward": -4.9024658203125, "reward_std": 1.6234406232833862, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -13.539306640625, "rewards/ppl_reward/std": 16.903244018554688, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 197.328125, "completions/mean_terminated_length": 197.328125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.8102345415778252, "grad_norm": 1.8802614212036133, "kl": 1.369140625, "learning_rate": 1.976708828453058e-05, "loss": 0.025, "num_tokens": 15020196.0, "reward": -2.0452880859375, "reward_std": 0.46917036175727844, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.949951171875, "rewards/ppl_reward/std": 3.373485565185547, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 198.0625, "completions/mean_terminated_length": 198.0625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.8114529393847091, "grad_norm": 1.5550163984298706, "kl": 1.0, "learning_rate": 1.9765257946935944e-05, "loss": -0.0214, "num_tokens": 15039320.0, "reward": -3.279296875, "reward_std": 0.8524007797241211, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -10.48828125, "rewards/ppl_reward/std": 6.469256401062012, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 214.5625, "completions/mean_terminated_length": 214.5625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.812671337191593, "grad_norm": 2.1453964710235596, "kl": 1.080078125, "learning_rate": 1.9763420531012933e-05, "loss": -0.0286, "num_tokens": 15059780.0, "reward": -2.26885986328125, "reward_std": 0.6719491481781006, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.3502197265625, "rewards/ppl_reward/std": 7.348599910736084, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 248.515625, "completions/mean_terminated_length": 248.515625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.813889734998477, "grad_norm": 1.3886051177978516, "kl": 0.8515625, "learning_rate": 1.9761576038093394e-05, "loss": -0.0005, "num_tokens": 15082613.0, "reward": -2.2607421875, "reward_std": 0.39604711532592773, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -8.521484375, "rewards/ppl_reward/std": 8.243535041809082, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 245.0625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.815108132805361, "grad_norm": 1.5377410650253296, "kl": 1.0029296875, "learning_rate": 1.9759724469514302e-05, "loss": -0.0358, "num_tokens": 15104945.0, "reward": -3.53759765625, "reward_std": 0.9083170890808105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -11.0361328125, "rewards/ppl_reward/std": 7.796808242797852, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 250.140625, "completions/mean_terminated_length": 250.140625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8163265306122449, "grad_norm": 1.2372145652770996, "kl": 0.767578125, "learning_rate": 1.975786582661777e-05, "loss": -0.0262, "num_tokens": 15127802.0, "reward": -0.58160400390625, "reward_std": 0.23054184019565582, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.1319580078125, "rewards/ppl_reward/std": 2.7427682876586914, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 267.46875, "completions/mean_terminated_length": 267.46875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.8175449284191288, "grad_norm": 1.1693583726882935, "kl": 0.7509765625, "learning_rate": 1.9756000110751023e-05, "loss": -0.0365, "num_tokens": 15151376.0, "reward": -2.0362548828125, "reward_std": 0.3920832574367523, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -8.033447265625, "rewards/ppl_reward/std": 3.82059645652771, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 286.328125, "completions/mean_terminated_length": 286.328125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8187633262260128, "grad_norm": 1.052513837814331, "kl": 0.712890625, "learning_rate": 1.9754127323266426e-05, "loss": 0.0506, "num_tokens": 15177029.0, "reward": -1.7191162109375, "reward_std": 0.40292996168136597, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.399169921875, "rewards/ppl_reward/std": 5.476434707641602, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 320.203125, "completions/mean_terminated_length": 320.203125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8199817240328967, "grad_norm": 1.2477359771728516, "kl": 0.728515625, "learning_rate": 1.9752247465521467e-05, "loss": 0.0523, "num_tokens": 15205130.0, "reward": -1.0308837890625, "reward_std": 0.249640554189682, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.022705078125, "rewards/ppl_reward/std": 5.41269063949585, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 293.15625, "completions/mean_terminated_length": 293.15625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8212001218397806, "grad_norm": 1.1247583627700806, "kl": 0.677734375, "learning_rate": 1.9750360538878753e-05, "loss": 0.0522, "num_tokens": 15230668.0, "reward": -1.35400390625, "reward_std": 0.38934046030044556, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.5908203125, "rewards/ppl_reward/std": 3.785442352294922, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8224185196466647, "grad_norm": 1.0678062438964844, "kl": 0.6796875, "learning_rate": 1.9748466544706024e-05, "loss": 0.0145, "num_tokens": 15255694.0, "reward": -0.953369140625, "reward_std": 0.2349916249513626, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.83642578125, "rewards/ppl_reward/std": 1.963236689567566, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 244.359375, "completions/mean_terminated_length": 244.359375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8236369174535486, "grad_norm": 1.1502426862716675, "kl": 0.6982421875, "learning_rate": 1.9746565484376132e-05, "loss": 0.0313, "num_tokens": 15278061.0, "reward": -0.6246337890625, "reward_std": 0.2085776925086975, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.218017578125, "rewards/ppl_reward/std": 3.205723762512207, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.8248553152604325, "grad_norm": 1.0212026834487915, "kl": 0.7646484375, "learning_rate": 1.9744657359267063e-05, "loss": -0.0661, "num_tokens": 15300093.0, "reward": -2.2406005859375, "reward_std": 0.6381947994232178, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.199951171875, "rewards/ppl_reward/std": 4.463400840759277, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 213.9375, "completions/mean_terminated_length": 213.9375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8260737130673165, "grad_norm": 1.8690778017044067, "kl": 0.8388671875, "learning_rate": 1.974274217076191e-05, "loss": 0.0027, "num_tokens": 15320673.0, "reward": -0.6715087890625, "reward_std": 0.668948233127594, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.968017578125, "rewards/ppl_reward/std": 2.4239940643310547, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 211.71875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.8272921108742004, "grad_norm": 2.154574155807495, "kl": 1.1142578125, "learning_rate": 1.9740819920248904e-05, "loss": -0.0214, "num_tokens": 15341335.0, "reward": -1.3907470703125, "reward_std": 0.7713679671287537, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.289306640625, "rewards/ppl_reward/std": 4.7540974617004395, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13903142511844635, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 219.234375, "completions/mean_terminated_length": 219.234375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.8285105086810843, "grad_norm": 1.5000311136245728, "kl": 0.9541015625, "learning_rate": 1.973889060912138e-05, "loss": -0.0624, "num_tokens": 15362430.0, "reward": -1.85498046875, "reward_std": 0.6145289540290833, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.2333984375, "rewards/ppl_reward/std": 2.4856059551239014, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11935414373874664, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 225.109375, "completions/mean_terminated_length": 225.109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8297289064879684, "grad_norm": 1.3755296468734741, "kl": 1.18359375, "learning_rate": 1.9736954238777793e-05, "loss": -0.049, "num_tokens": 15383749.0, "reward": -2.4642333984375, "reward_std": 0.7409934997558594, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.381591796875, "rewards/ppl_reward/std": 7.991583824157715, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1041666716337204, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 259.74603271484375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.8309473042948523, "grad_norm": 1.8601641654968262, "kl": 3.2763671875, "learning_rate": 1.973501081062172e-05, "loss": 0.1989, "num_tokens": 15408657.0, "reward": -2.6624755859375, "reward_std": 0.6507904529571533, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.817138671875, "rewards/ppl_reward/std": 6.337563991546631, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1620931327342987, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 261.53125, "completions/mean_terminated_length": 249.4285888671875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8321657021017362, "grad_norm": 1.7991868257522583, "kl": 2.322265625, "learning_rate": 1.9733060326061846e-05, "loss": 0.136, "num_tokens": 15432363.0, "reward": -0.4967041015625, "reward_std": 0.6218917369842529, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.610595703125, "rewards/ppl_reward/std": 2.1956450939178467, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 342.34375, "completions/mean_terminated_length": 320.3548278808594, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8333840999086202, "grad_norm": 8.054475784301758, "kl": 11.78515625, "learning_rate": 1.9731102786511983e-05, "loss": 0.6223, "num_tokens": 15460553.0, "reward": -4.8887939453125, "reward_std": 4.1555070877075195, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -12.855712890625, "rewards/ppl_reward/std": 27.123376846313477, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.25185325741767883, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 296.7619323730469, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8346024977155041, "grad_norm": 5.196903705596924, "kl": 8.02734375, "learning_rate": 1.972913819339105e-05, "loss": 0.4659, "num_tokens": 15487449.0, "reward": -1.7864990234375, "reward_std": 0.884092390537262, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -6.635498046875, "rewards/ppl_reward/std": 4.206363201141357, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.835820895522388, "grad_norm": 2.6381635665893555, "kl": 2.99609375, "learning_rate": 1.972716654812307e-05, "loss": 0.1214, "num_tokens": 15513073.0, "reward": -2.9718017578125, "reward_std": 1.3422011137008667, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.545166015625, "rewards/ppl_reward/std": 14.646021842956543, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 333.90625, "completions/mean_terminated_length": 333.90625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.837039293329272, "grad_norm": 3.7735531330108643, "kl": 7.369140625, "learning_rate": 1.9725187852137195e-05, "loss": 0.3947, "num_tokens": 15540819.0, "reward": -1.58648681640625, "reward_std": 1.0959794521331787, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.3604736328125, "rewards/ppl_reward/std": 4.729855060577393, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2357022762298584, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 315.109375, "completions/mean_terminated_length": 303.8571472167969, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.838257691136156, "grad_norm": 4.1159892082214355, "kl": 4.28125, "learning_rate": 1.9723202106867674e-05, "loss": 0.256, "num_tokens": 15567650.0, "reward": -1.3802490234375, "reward_std": 0.9243065714836121, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.151123046875, "rewards/ppl_reward/std": 2.1614809036254883, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2345155030488968, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 376.515625, "completions/mean_terminated_length": 344.672119140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8394760889430399, "grad_norm": 7.737274169921875, "kl": 5.32421875, "learning_rate": 1.9721209313753873e-05, "loss": 0.5099, "num_tokens": 15598947.0, "reward": -1.015625, "reward_std": 0.8308223485946655, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.4296875, "rewards/ppl_reward/std": 2.579362630844116, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19444002211093903, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 339.6875, "completions/mean_terminated_length": 306.03277587890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8406944867499239, "grad_norm": 7.438424587249756, "kl": 6.4140625, "learning_rate": 1.9719209474240263e-05, "loss": 0.5708, "num_tokens": 15627303.0, "reward": -1.478759765625, "reward_std": 1.0430904626846313, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.37939453125, "rewards/ppl_reward/std": 3.7481677532196045, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24282869696617126, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 339.15625, "completions/mean_terminated_length": 317.06451416015625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8419128845568078, "grad_norm": 2.571540355682373, "kl": 7.125, "learning_rate": 1.9717202589776424e-05, "loss": 0.4972, "num_tokens": 15655753.0, "reward": -1.3358154296875, "reward_std": 0.5554416179656982, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.382568359375, "rewards/ppl_reward/std": 1.750314474105835, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 367.453125, "completions/mean_terminated_length": 299.53448486328125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.8431312823636917, "grad_norm": 5.863584041595459, "kl": 12.802734375, "learning_rate": 1.9715188661817045e-05, "loss": 0.909, "num_tokens": 15686110.0, "reward": -1.89654541015625, "reward_std": 1.3308627605438232, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.2462158203125, "rewards/ppl_reward/std": 6.219181537628174, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.31800705194473267, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 399.171875, "completions/mean_terminated_length": 309.9107360839844, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8443496801705757, "grad_norm": 10.309167861938477, "kl": 15.5625, "learning_rate": 1.9713167691821914e-05, "loss": 0.9582, "num_tokens": 15718977.0, "reward": -3.843505859375, "reward_std": 2.2039966583251953, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -10.76513671875, "rewards/ppl_reward/std": 6.929214954376221, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.36587780714035034, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 407.265625, "completions/mean_terminated_length": 306.345458984375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8455680779774596, "grad_norm": 5.7661590576171875, "kl": 6.390625, "learning_rate": 1.971113968125593e-05, "loss": 0.6132, "num_tokens": 15751946.0, "reward": -2.6785888671875, "reward_std": 1.389505386352539, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.614990234375, "rewards/ppl_reward/std": 6.739162445068359, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.34840819239616394, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 386.921875, "completions/mean_terminated_length": 308.6842041015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8467864757843436, "grad_norm": 13.60916519165039, "kl": 5.9453125, "learning_rate": 1.9709104631589092e-05, "loss": 0.6068, "num_tokens": 15784669.0, "reward": -2.6376953125, "reward_std": 1.602447748184204, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.642578125, "rewards/ppl_reward/std": 6.432817459106445, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.30473998188972473, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 383.484375, "completions/mean_terminated_length": 291.9821472167969, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8480048735912276, "grad_norm": 2.7555582523345947, "kl": 8.59375, "learning_rate": 1.9707062544296497e-05, "loss": 0.7004, "num_tokens": 15816020.0, "reward": -2.6094970703125, "reward_std": 1.3693569898605347, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.656494140625, "rewards/ppl_reward/std": 3.5530591011047363, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2847827076911926, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 346.609375, "completions/mean_terminated_length": 276.53448486328125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8492232713981115, "grad_norm": 8.666808128356934, "kl": 13.212890625, "learning_rate": 1.9705013420858353e-05, "loss": 0.8925, "num_tokens": 15844771.0, "reward": -2.182861328125, "reward_std": 1.491417646408081, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.82666015625, "rewards/ppl_reward/std": 5.922261714935303, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.31435462832450867, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 333.265625, "completions/mean_terminated_length": 248.4385986328125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8504416692049954, "grad_norm": 11.948846817016602, "kl": 15.505859375, "learning_rate": 1.9702957262759964e-05, "loss": 1.0279, "num_tokens": 15872644.0, "reward": -1.9771728515625, "reward_std": 1.9288541078567505, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.337158203125, "rewards/ppl_reward/std": 5.796276569366455, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.30514663457870483, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 298.71875, "completions/mean_terminated_length": 250.36668395996094, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8516600670118794, "grad_norm": 7.605169296264648, "kl": 8.49609375, "learning_rate": 1.9700894071491736e-05, "loss": 0.8089, "num_tokens": 15898882.0, "reward": -0.714111328125, "reward_std": 0.8452438116073608, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.95947265625, "rewards/ppl_reward/std": 2.777832269668579, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.26679685711860657, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 256.453125, "completions/mean_terminated_length": 244.2698516845703, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8528784648187633, "grad_norm": 2.572305917739868, "kl": 3.396484375, "learning_rate": 1.9698823848549155e-05, "loss": 0.2436, "num_tokens": 15921727.0, "reward": -5.72467041015625, "reward_std": 0.9408194422721863, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -15.1759033203125, "rewards/ppl_reward/std": 28.410964965820312, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.21675680577754974, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 322.953125, "completions/mean_terminated_length": 276.2166748046875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8540968626256473, "grad_norm": 2.7520666122436523, "kl": 9.029296875, "learning_rate": 1.9696746595432828e-05, "loss": 0.6539, "num_tokens": 15949908.0, "reward": -1.231689453125, "reward_std": 1.2100510597229004, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.89306640625, "rewards/ppl_reward/std": 3.976109743118286, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.2382858246564865, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 324.140625, "completions/mean_terminated_length": 238.19297790527344, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8553152604325313, "grad_norm": 5.591440677642822, "kl": 10.56640625, "learning_rate": 1.969466231364845e-05, "loss": 0.9411, "num_tokens": 15977149.0, "reward": -1.3358154296875, "reward_std": 1.3864834308624268, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.132568359375, "rewards/ppl_reward/std": 3.459707021713257, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.3180801570415497, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 237.109375, "completions/mean_terminated_length": 224.6190643310547, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8565336582394152, "grad_norm": 2.456195831298828, "kl": 2.765625, "learning_rate": 1.9692571004706805e-05, "loss": 0.1896, "num_tokens": 15998916.0, "reward": -2.6588134765625, "reward_std": 1.7772910594940186, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.856689453125, "rewards/ppl_reward/std": 7.003198146820068, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 259.921875, "completions/mean_terminated_length": 208.98333740234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.8577520560462991, "grad_norm": 7.729458808898926, "kl": 10.4736328125, "learning_rate": 1.969047267012377e-05, "loss": 0.8076, "num_tokens": 16021583.0, "reward": -1.26983642578125, "reward_std": 1.2620503902435303, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.1881103515625, "rewards/ppl_reward/std": 3.787865400314331, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2599400579929352, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 247.3125, "completions/mean_terminated_length": 222.258056640625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.858970453853183, "grad_norm": 5.414856910705566, "kl": 5.24609375, "learning_rate": 1.968836731142033e-05, "loss": 0.4853, "num_tokens": 16044187.0, "reward": -3.23388671875, "reward_std": 1.0691543817520142, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.1474609375, "rewards/ppl_reward/std": 9.572639465332031, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 230.6031951904297, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.860188851660067, "grad_norm": 4.816585063934326, "kl": 3.6337890625, "learning_rate": 1.968625493012254e-05, "loss": 0.1657, "num_tokens": 16066899.0, "reward": -3.428466796875, "reward_std": 2.660465717315674, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -10.63037109375, "rewards/ppl_reward/std": 15.723665237426758, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 229.20001220703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8614072494669509, "grad_norm": 22.23157501220703, "kl": 15.5546875, "learning_rate": 1.968413552776156e-05, "loss": 0.9231, "num_tokens": 16092187.0, "reward": -2.892333984375, "reward_std": 2.396010637283325, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.45654296875, "rewards/ppl_reward/std": 16.09578514099121, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.24587368965148926, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 246.28125, "completions/mean_terminated_length": 208.03277587890625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.862625647273835, "grad_norm": 8.374480247497559, "kl": 8.0087890625, "learning_rate": 1.9682009105873633e-05, "loss": 0.6569, "num_tokens": 16114917.0, "reward": -2.6046142578125, "reward_std": 1.0652236938476562, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.943603515625, "rewards/ppl_reward/std": 4.564883708953857, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 208.2666778564453, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8638440450807189, "grad_norm": 7.414191246032715, "kl": 5.88671875, "learning_rate": 1.967987566600009e-05, "loss": 0.5957, "num_tokens": 16138421.0, "reward": -0.8582763671875, "reward_std": 1.0571372509002686, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.333740234375, "rewards/ppl_reward/std": 3.1938321590423584, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 225.296875, "completions/mean_terminated_length": 225.296875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8650624428876028, "grad_norm": 1.2169866561889648, "kl": 1.0654296875, "learning_rate": 1.9677735209687362e-05, "loss": -0.0271, "num_tokens": 16160888.0, "reward": -2.6138916015625, "reward_std": 0.6723801493644714, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.899658203125, "rewards/ppl_reward/std": 10.387612342834473, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 203.60000610351562, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8662808406944867, "grad_norm": 31.32073211669922, "kl": 27.32421875, "learning_rate": 1.9675587738486935e-05, "loss": 1.5265, "num_tokens": 16184144.0, "reward": -1.1461181640625, "reward_std": 1.038118839263916, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.831298828125, "rewards/ppl_reward/std": 2.9807991981506348, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2649018466472626, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 305.78125, "completions/mean_terminated_length": 231.48275756835938, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8674992385013707, "grad_norm": 31.012149810791016, "kl": 32.5, "learning_rate": 1.967343325395542e-05, "loss": 1.7766, "num_tokens": 16211570.0, "reward": -0.815673828125, "reward_std": 1.1715409755706787, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.06884765625, "rewards/ppl_reward/std": 2.3302767276763916, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.29504841566085815, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 291.296875, "completions/mean_terminated_length": 201.3157958984375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8687176363082546, "grad_norm": 25.40196418762207, "kl": 29.1875, "learning_rate": 1.9671271757654474e-05, "loss": 1.7614, "num_tokens": 16237157.0, "reward": -3.1197509765625, "reward_std": 2.7399091720581055, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.692626953125, "rewards/ppl_reward/std": 9.767990112304688, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.31487196683883667, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 229.28125, "completions/mean_terminated_length": 216.6666717529297, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8699360341151386, "grad_norm": 4.406871318817139, "kl": 6.26171875, "learning_rate": 1.9669103251150865e-05, "loss": 0.4547, "num_tokens": 16259735.0, "reward": -1.2781982421875, "reward_std": 0.7266424298286438, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.056396484375, "rewards/ppl_reward/std": 2.421243190765381, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1666666716337204, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 289.671875, "completions/mean_terminated_length": 253.55735778808594, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.8711544319220226, "grad_norm": 6.617678165435791, "kl": 8.984375, "learning_rate": 1.9666927736016425e-05, "loss": 0.6595, "num_tokens": 16286234.0, "reward": -1.605712890625, "reward_std": 1.2234485149383545, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.57861328125, "rewards/ppl_reward/std": 3.383298397064209, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2886483073234558, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 233.0625, "completions/mean_terminated_length": 220.50794982910156, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.8723728297289065, "grad_norm": 7.243121147155762, "kl": 11.25, "learning_rate": 1.9664745213828075e-05, "loss": 0.9563, "num_tokens": 16308206.0, "reward": -0.807861328125, "reward_std": 0.9996618628501892, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.27197265625, "rewards/ppl_reward/std": 2.745445728302002, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.24346621334552765, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 224.71875, "completions/mean_terminated_length": 198.9354705810547, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8735912275357904, "grad_norm": 3.0907955169677734, "kl": 4.96484375, "learning_rate": 1.9662555686167808e-05, "loss": 0.3773, "num_tokens": 16329556.0, "reward": -1.549072265625, "reward_std": 1.3608596324920654, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.85595703125, "rewards/ppl_reward/std": 6.149107456207275, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 266.078125, "completions/mean_terminated_length": 228.8032684326172, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8748096253426744, "grad_norm": 5.765261650085449, "kl": 11.9990234375, "learning_rate": 1.9660359154622704e-05, "loss": 0.7697, "num_tokens": 16353473.0, "reward": -2.0257568359375, "reward_std": 1.0019563436508179, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.684326171875, "rewards/ppl_reward/std": 4.497971534729004, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2257249802350998, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 227.359375, "completions/mean_terminated_length": 227.359375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8760280231495583, "grad_norm": 1.7312462329864502, "kl": 3.34375, "learning_rate": 1.9658155620784912e-05, "loss": 0.2001, "num_tokens": 16375392.0, "reward": -1.60791015625, "reward_std": 0.8404171466827393, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.9345703125, "rewards/ppl_reward/std": 4.3089752197265625, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 237.21875, "completions/mean_terminated_length": 224.7301788330078, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8772464209564422, "grad_norm": 2.1698570251464844, "kl": 5.0625, "learning_rate": 1.9655945086251656e-05, "loss": 0.2539, "num_tokens": 16397542.0, "reward": -0.8839111328125, "reward_std": 0.7644879221916199, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.392822265625, "rewards/ppl_reward/std": 2.549687385559082, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2514837086200714, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 317.6875, "completions/mean_terminated_length": 244.6206817626953, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8784648187633263, "grad_norm": 4.232175350189209, "kl": 10.65234375, "learning_rate": 1.9653727552625242e-05, "loss": 0.8136, "num_tokens": 16424602.0, "reward": -3.445556640625, "reward_std": 2.8410727977752686, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -10.32861328125, "rewards/ppl_reward/std": 13.283537864685059, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.29839184880256653, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 353.875, "completions/mean_terminated_length": 199.23077392578125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.8796832165702102, "grad_norm": 8.728758811950684, "kl": 16.3271484375, "learning_rate": 1.9651503021513046e-05, "loss": 1.0744, "num_tokens": 16454810.0, "reward": -4.503662109375, "reward_std": 2.5516464710235596, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -11.91357421875, "rewards/ppl_reward/std": 19.844120025634766, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.43615156412124634, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 388.71875, "completions/mean_terminated_length": 226.78431701660156, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8809016143770941, "grad_norm": 5.314696788787842, "kl": 13.3828125, "learning_rate": 1.964927149452751e-05, "loss": 1.0697, "num_tokens": 16485888.0, "reward": -2.4122314453125, "reward_std": 2.5513224601745605, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.980712890625, "rewards/ppl_reward/std": 7.037054538726807, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.3980981707572937, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 338.09375, "completions/mean_terminated_length": 225.8545379638672, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8821200121839781, "grad_norm": 14.387325286865234, "kl": 11.5390625, "learning_rate": 1.9647032973286157e-05, "loss": 1.1583, "num_tokens": 16515118.0, "reward": -1.29132080078125, "reward_std": 1.7870235443115234, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.9420166015625, "rewards/ppl_reward/std": 4.098897933959961, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.34993976354599, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 360.15625, "completions/mean_terminated_length": 251.5272674560547, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.883338409990862, "grad_norm": 8.918397903442383, "kl": 13.28125, "learning_rate": 1.964478745941157e-05, "loss": 1.1204, "num_tokens": 16545256.0, "reward": -1.63916015625, "reward_std": 2.0255637168884277, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.5673828125, "rewards/ppl_reward/std": 5.093134880065918, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.3592725396156311, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 410.3125, "completions/mean_terminated_length": 205.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8845568077977459, "grad_norm": 20.639583587646484, "kl": 30.0, "learning_rate": 1.964253495453141e-05, "loss": 1.9577, "num_tokens": 16578532.0, "reward": -1.336181640625, "reward_std": 1.925990343093872, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.64111328125, "rewards/ppl_reward/std": 3.091517210006714, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.427246630191803, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 373.5625, "completions/mean_terminated_length": 223.4615478515625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.88577520560463, "grad_norm": 24.072404861450195, "kl": 30.25, "learning_rate": 1.9640275460278402e-05, "loss": 1.9159, "num_tokens": 16609176.0, "reward": -5.3646240234375, "reward_std": 4.503668785095215, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -13.940185546875, "rewards/ppl_reward/std": 11.675328254699707, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.3927472233772278, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 327.59375, "completions/mean_terminated_length": 255.55172729492188, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.8869936034115139, "grad_norm": 7.136646747589111, "kl": 13.5517578125, "learning_rate": 1.963800897829033e-05, "loss": 0.8525, "num_tokens": 16637774.0, "reward": -4.21923828125, "reward_std": 2.7722244262695312, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -11.9228515625, "rewards/ppl_reward/std": 11.680994033813477, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.29452258348464966, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 394.8125, "completions/mean_terminated_length": 249.61538696289062, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8882120012183978, "grad_norm": 3.1222164630889893, "kl": 11.181640625, "learning_rate": 1.9635735510210053e-05, "loss": 0.8654, "num_tokens": 16670282.0, "reward": -2.669921875, "reward_std": 2.5410208702087402, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.51171875, "rewards/ppl_reward/std": 10.122897148132324, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.38696199655532837, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 301.171875, "completions/mean_terminated_length": 252.9833526611328, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.8894303990252818, "grad_norm": 10.256717681884766, "kl": 5.2763671875, "learning_rate": 1.963345505768549e-05, "loss": 0.5225, "num_tokens": 16696189.0, "reward": -1.0140380859375, "reward_std": 0.9400810599327087, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.692138671875, "rewards/ppl_reward/std": 4.850450038909912, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2507120370864868, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 279.34375, "completions/mean_terminated_length": 255.32257080078125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.8906487968321657, "grad_norm": 4.223058223724365, "kl": 3.2529296875, "learning_rate": 1.9631167622369617e-05, "loss": 0.3109, "num_tokens": 16720963.0, "reward": -1.0211181640625, "reward_std": 0.4998782277107239, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.862548828125, "rewards/ppl_reward/std": 2.6185972690582275, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 243.87095642089844, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8918671946390496, "grad_norm": 2.2218594551086426, "kl": 3.9697265625, "learning_rate": 1.9628873205920486e-05, "loss": 0.176, "num_tokens": 16745539.0, "reward": -1.7724609375, "reward_std": 1.0943872928619385, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.169921875, "rewards/ppl_reward/std": 4.558294296264648, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23517554998397827, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 317.140625, "completions/mean_terminated_length": 282.3770446777344, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8930855924459336, "grad_norm": 5.594184875488281, "kl": 4.93896484375, "learning_rate": 1.9626571810001195e-05, "loss": 0.4847, "num_tokens": 16773084.0, "reward": -4.3125, "reward_std": 3.891305685043335, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -12.3359375, "rewards/ppl_reward/std": 20.06177520751953, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.244957834482193, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 265.8125, "completions/mean_terminated_length": 241.35482788085938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8943039902528176, "grad_norm": 2.866795301437378, "kl": 6.4970703125, "learning_rate": 1.9624263436279908e-05, "loss": 0.4045, "num_tokens": 16797208.0, "reward": -0.70556640625, "reward_std": 0.642298698425293, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.2470703125, "rewards/ppl_reward/std": 2.560636043548584, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 306.828125, "completions/mean_terminated_length": 246.05084228515625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8955223880597015, "grad_norm": 11.055034637451172, "kl": 16.359375, "learning_rate": 1.9621948086429847e-05, "loss": 1.0459, "num_tokens": 16824597.0, "reward": -0.8349609375, "reward_std": 1.0072520971298218, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.302734375, "rewards/ppl_reward/std": 2.0748534202575684, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2722889184951782, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 272.453125, "completions/mean_terminated_length": 222.35000610351562, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.8967407858665855, "grad_norm": 6.486608505249023, "kl": 11.728515625, "learning_rate": 1.961962576212928e-05, "loss": 0.7833, "num_tokens": 16849786.0, "reward": -2.17041015625, "reward_std": 2.455887794494629, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.9033203125, "rewards/ppl_reward/std": 11.799509048461914, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2763853967189789, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 274.640625, "completions/mean_terminated_length": 250.4677276611328, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8979591836734694, "grad_norm": 5.752386093139648, "kl": 4.1572265625, "learning_rate": 1.9617296465061554e-05, "loss": 0.3591, "num_tokens": 16874019.0, "reward": -1.7178955078125, "reward_std": 1.4494802951812744, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.138916015625, "rewards/ppl_reward/std": 5.021122932434082, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 291.953125, "completions/mean_terminated_length": 268.3387145996094, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8991775814803533, "grad_norm": 4.591813564300537, "kl": 4.078125, "learning_rate": 1.961496019691504e-05, "loss": 0.2676, "num_tokens": 16900776.0, "reward": -0.286865234375, "reward_std": 0.5889835953712463, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.32373046875, "rewards/ppl_reward/std": 1.5797423124313354, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 221.80001831054688, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.9003959792872372, "grad_norm": 1.922513723373413, "kl": 9.158203125, "learning_rate": 1.961261695938319e-05, "loss": 0.7008, "num_tokens": 16925260.0, "reward": -7.736572265625, "reward_std": 2.6428961753845215, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -19.22314453125, "rewards/ppl_reward/std": 31.085763931274414, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.24397502839565277, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 233.40625, "completions/mean_terminated_length": 220.85716247558594, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.9016143770941212, "grad_norm": 2.911388397216797, "kl": 5.845703125, "learning_rate": 1.961026675416449e-05, "loss": 0.3755, "num_tokens": 16947086.0, "reward": -2.4630126953125, "reward_std": 1.4903113842010498, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.715087890625, "rewards/ppl_reward/std": 6.251905918121338, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 293.015625, "completions/mean_terminated_length": 203.24562072753906, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.9028327749010052, "grad_norm": 15.942695617675781, "kl": 19.078125, "learning_rate": 1.9607909582962478e-05, "loss": 1.2033, "num_tokens": 16972767.0, "reward": -3.533447265625, "reward_std": 4.284459114074707, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -10.45751953125, "rewards/ppl_reward/std": 19.793209075927734, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.3117053508758545, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 294.984375, "completions/mean_terminated_length": 246.3833465576172, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9040511727078892, "grad_norm": 8.640076637268066, "kl": 8.248046875, "learning_rate": 1.960554544748575e-05, "loss": 0.5577, "num_tokens": 16998142.0, "reward": -5.3387451171875, "reward_std": 2.417410135269165, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -14.255615234375, "rewards/ppl_reward/std": 29.344223022460938, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2366211861371994, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 357.90625, "completions/mean_terminated_length": 234.55555725097656, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9052695705147731, "grad_norm": 5.399694919586182, "kl": 10.2890625, "learning_rate": 1.9603174349447946e-05, "loss": 0.5697, "num_tokens": 17027232.0, "reward": -2.2760009765625, "reward_std": 1.987594485282898, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -7.466064453125, "rewards/ppl_reward/std": 5.2188239097595215, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.3308829367160797, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 470.359375, "completions/mean_terminated_length": 236.60000610351562, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.906487968321657, "grad_norm": 29.64480972290039, "kl": 6.82421875, "learning_rate": 1.9600796290567747e-05, "loss": 0.8163, "num_tokens": 17064247.0, "reward": -1.3677978515625, "reward_std": 1.43723464012146, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -5.634033203125, "rewards/ppl_reward/std": 2.776031017303467, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.3538820743560791, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 257.14288330078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.9077063661285409, "grad_norm": 11.447883605957031, "kl": 3.57421875, "learning_rate": 1.9598411272568892e-05, "loss": 0.3294, "num_tokens": 17094135.0, "reward": -1.74169921875, "reward_std": 1.3324377536773682, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.9052734375, "rewards/ppl_reward/std": 4.909593105316162, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2816080152988434, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 240.515625, "completions/mean_terminated_length": 215.24192810058594, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9089247639354249, "grad_norm": 5.955644130706787, "kl": 3.8583984375, "learning_rate": 1.9596019297180146e-05, "loss": 0.4423, "num_tokens": 17116176.0, "reward": -1.61029052734375, "reward_std": 1.6984868049621582, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.8924560546875, "rewards/ppl_reward/std": 7.277982234954834, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 376.265625, "completions/mean_terminated_length": 226.78846740722656, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9101431617423089, "grad_norm": 8.553918838500977, "kl": 16.140625, "learning_rate": 1.9593620366135338e-05, "loss": 1.2852, "num_tokens": 17146969.0, "reward": -2.247802734375, "reward_std": 1.7560310363769531, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -7.66748046875, "rewards/ppl_reward/std": 5.038455486297607, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.28770697116851807, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 462.859375, "completions/mean_terminated_length": 225.93333435058594, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.9113615595491928, "grad_norm": 25.3290958404541, "kl": 27.375, "learning_rate": 1.959121448117332e-05, "loss": 1.6542, "num_tokens": 17183328.0, "reward": -4.5133056640625, "reward_std": 3.4683656692504883, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "rewards/ppl_reward/mean": -11.854736328125, "rewards/ppl_reward/std": 13.31879711151123, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.3294980227947235, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 544.453125, "completions/mean_terminated_length": 216.34210205078125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9125799573560768, "grad_norm": 20.602861404418945, "kl": 22.625, "learning_rate": 1.9588801644038e-05, "loss": 1.4384, "num_tokens": 17225029.0, "reward": -4.1180419921875, "reward_std": 2.680501699447632, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -10.376708984375, "rewards/ppl_reward/std": 7.675899982452393, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.3830971121788025, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 505.234375, "completions/mean_terminated_length": 172.6923065185547, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.9137983551629607, "grad_norm": 18.214689254760742, "kl": 10.296875, "learning_rate": 1.958638185647831e-05, "loss": 1.1246, "num_tokens": 17263956.0, "reward": -3.757568359375, "reward_std": 3.244670867919922, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5039526224136353, "rewards/ppl_reward/mean": -9.83544921875, "rewards/ppl_reward/std": 10.035635948181152, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.362878680229187, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 624.359375, "completions/mean_terminated_length": 142.03448486328125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.9150167529698446, "grad_norm": 2.7079269886016846, "kl": 10.015625, "learning_rate": 1.958395512024824e-05, "loss": 0.8179, "num_tokens": 17310491.0, "reward": -7.349609375, "reward_std": 4.7702436447143555, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.49776285886764526, "rewards/ppl_reward/mean": -16.83984375, "rewards/ppl_reward/std": 15.586542129516602, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.3527633249759674, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 479.328125, "completions/mean_terminated_length": 152.52500915527344, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.9162351507767286, "grad_norm": 19.440540313720703, "kl": 8.9140625, "learning_rate": 1.9581521437106795e-05, "loss": 1.0553, "num_tokens": 17347856.0, "reward": -3.46185302734375, "reward_std": 2.7021894454956055, "rewards/format_reward/mean": 0.578125, "rewards/format_reward/std": 0.49776285886764526, "rewards/ppl_reward/mean": -9.4393310546875, "rewards/ppl_reward/std": 11.959878921508789, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.3818001449108124, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 509.96875, "completions/mean_terminated_length": 180.4615478515625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.9174535485836125, "grad_norm": 7.0202717781066895, "kl": 12.328125, "learning_rate": 1.9579080808818035e-05, "loss": 1.1289, "num_tokens": 17387174.0, "reward": -2.6998291015625, "reward_std": 2.427626371383667, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "rewards/ppl_reward/mean": -7.821533203125, "rewards/ppl_reward/std": 4.120260238647461, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.36587780714035034, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 357.0625, "completions/mean_terminated_length": 170.3199920654297, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9186719463904965, "grad_norm": 2.583071231842041, "kl": 14.09375, "learning_rate": 1.9576633237151033e-05, "loss": 1.1405, "num_tokens": 17416370.0, "reward": -3.0452880859375, "reward_std": 2.46195650100708, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "rewards/ppl_reward/mean": -8.879638671875, "rewards/ppl_reward/std": 9.565666198730469, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.3549317717552185, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 281.8125, "completions/mean_terminated_length": 160.36363220214844, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.9198903441973805, "grad_norm": 4.94166898727417, "kl": 16.125, "learning_rate": 1.9574178723879913e-05, "loss": 1.2194, "num_tokens": 17441334.0, "reward": -1.35125732421875, "reward_std": 1.5127583742141724, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -5.7493896484375, "rewards/ppl_reward/std": 3.237226724624634, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.32874447107315063, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 199.890625, "completions/mean_terminated_length": 144.95001220703125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.9211087420042644, "grad_norm": 8.860706329345703, "kl": 13.3671875, "learning_rate": 1.9571717270783827e-05, "loss": 0.8371, "num_tokens": 17461231.0, "reward": -0.9161376953125, "reward_std": 0.9615941047668457, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.168212890625, "rewards/ppl_reward/std": 2.112819194793701, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.27433067560195923, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 146.140625, "completions/mean_terminated_length": 146.140625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9223271398111483, "grad_norm": 2.1710152626037598, "kl": 3.212890625, "learning_rate": 1.9569248879646948e-05, "loss": 0.1223, "num_tokens": 17477776.0, "reward": -1.1336669921875, "reward_std": 1.341883659362793, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.876708984375, "rewards/ppl_reward/std": 3.8201332092285156, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.23345555365085602, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 161.453125, "completions/mean_terminated_length": 147.7619171142578, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9235455376180323, "grad_norm": 2.3246281147003174, "kl": 6.2685546875, "learning_rate": 1.9566773552258492e-05, "loss": 0.4539, "num_tokens": 17495253.0, "reward": -2.4044189453125, "reward_std": 1.1121827363967896, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.551025390625, "rewards/ppl_reward/std": 6.2483296394348145, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 154.703125, "completions/mean_terminated_length": 154.703125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9247639354249162, "grad_norm": 2.3879361152648926, "kl": 4.1884765625, "learning_rate": 1.956429129041269e-05, "loss": 0.2149, "num_tokens": 17512202.0, "reward": -1.3740234375, "reward_std": 0.9116184115409851, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.255859375, "rewards/ppl_reward/std": 3.161891222000122, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.25341787934303284, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 143.265625, "completions/mean_terminated_length": 143.265625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9259823332318002, "grad_norm": 1.810645580291748, "kl": 0.92578125, "learning_rate": 1.9561802095908804e-05, "loss": 0.0349, "num_tokens": 17528451.0, "reward": -0.91094970703125, "reward_std": 0.546454131603241, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.6422119140625, "rewards/ppl_reward/std": 4.530053615570068, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 129.515625, "completions/mean_terminated_length": 129.515625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9272007310386842, "grad_norm": 1.5611170530319214, "kl": 0.8427734375, "learning_rate": 1.9559305970551125e-05, "loss": -0.087, "num_tokens": 17544244.0, "reward": -1.360595703125, "reward_std": 0.34239548444747925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.59619140625, "rewards/ppl_reward/std": 4.897004127502441, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 135.87301635742188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9284191288455681, "grad_norm": 2.5467448234558105, "kl": 5.61328125, "learning_rate": 1.9556802916148963e-05, "loss": 0.271, "num_tokens": 17560860.0, "reward": -2.601318359375, "reward_std": 1.0670286417007446, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.82763671875, "rewards/ppl_reward/std": 7.884378433227539, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 160.015625, "completions/mean_terminated_length": 146.3015899658203, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.929637526652452, "grad_norm": 2.4142673015594482, "kl": 7.2998046875, "learning_rate": 1.9554292934516653e-05, "loss": 0.5739, "num_tokens": 17578261.0, "reward": -0.2877197265625, "reward_std": 0.7330451011657715, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.161376953125, "rewards/ppl_reward/std": 1.7730953693389893, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 149.83050537109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.930855924459336, "grad_norm": 17.22871208190918, "kl": 20.21484375, "learning_rate": 1.9551776027473547e-05, "loss": 1.1365, "num_tokens": 17600133.0, "reward": -1.6842041015625, "reward_std": 1.3339197635650635, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.688720703125, "rewards/ppl_reward/std": 5.168800354003906, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.24368895590305328, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 222.84375, "completions/mean_terminated_length": 169.433349609375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.9320743222662199, "grad_norm": 12.820183753967285, "kl": 16.7421875, "learning_rate": 1.9549252196844028e-05, "loss": 0.8867, "num_tokens": 17622443.0, "reward": -3.7669677734375, "reward_std": 2.4098401069641113, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -10.447998046875, "rewards/ppl_reward/std": 11.981513977050781, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.2858421504497528, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 116.6875, "completions/mean_terminated_length": 116.6875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.9332927200731038, "grad_norm": 8.124235153198242, "kl": 5.41015625, "learning_rate": 1.9546721444457484e-05, "loss": 0.1053, "num_tokens": 17636327.0, "reward": -4.1978759765625, "reward_std": 4.363507270812988, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -11.356689453125, "rewards/ppl_reward/std": 17.70295524597168, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.27297118306159973, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 156.96875, "completions/mean_terminated_length": 143.20635986328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9345111178799879, "grad_norm": 4.889418601989746, "kl": 5.703125, "learning_rate": 1.9544183772148325e-05, "loss": 0.116, "num_tokens": 17653493.0, "reward": -1.5316162109375, "reward_std": 1.2730084657669067, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -6.039794921875, "rewards/ppl_reward/std": 3.443310022354126, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.28256967663764954, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 175.71875, "completions/mean_terminated_length": 162.2539825439453, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9357295156868718, "grad_norm": 10.379880905151367, "kl": 5.1796875, "learning_rate": 1.954163918175598e-05, "loss": 0.3905, "num_tokens": 17672147.0, "reward": -1.84228515625, "reward_std": 1.3003329038619995, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.0048828125, "rewards/ppl_reward/std": 4.778304576873779, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2439432591199875, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 158.9375, "completions/mean_terminated_length": 145.20635986328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.9369479134937557, "grad_norm": 10.254528999328613, "kl": 1.94140625, "learning_rate": 1.9539087675124892e-05, "loss": 0.3401, "num_tokens": 17689151.0, "reward": -1.23211669921875, "reward_std": 0.7091070413589478, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.1517333984375, "rewards/ppl_reward/std": 3.697216272354126, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 138.90625, "completions/mean_terminated_length": 138.90625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.9381663113006397, "grad_norm": 6.941137313842773, "kl": 1.9619140625, "learning_rate": 1.953652925410451e-05, "loss": 0.1475, "num_tokens": 17704393.0, "reward": -1.201171875, "reward_std": 0.46502354741096497, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.16015625, "rewards/ppl_reward/std": 4.516382694244385, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 155.5625, "completions/mean_terminated_length": 155.5625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.9393847091075236, "grad_norm": 1.6504898071289062, "kl": 0.896484375, "learning_rate": 1.9533963920549307e-05, "loss": 0.0336, "num_tokens": 17721493.0, "reward": -0.9810791015625, "reward_std": 0.47490400075912476, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.829345703125, "rewards/ppl_reward/std": 2.440446376800537, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9406031069144075, "grad_norm": 4.222178936004639, "kl": 1.294921875, "learning_rate": 1.953139167631876e-05, "loss": -0.1435, "num_tokens": 17735376.0, "reward": -3.279052734375, "reward_std": 1.521687626838684, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.23779296875, "rewards/ppl_reward/std": 8.282392501831055, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 165.421875, "completions/mean_terminated_length": 165.421875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.9418215047212914, "grad_norm": 2.041886568069458, "kl": 1.4931640625, "learning_rate": 1.952881252327735e-05, "loss": 0.128, "num_tokens": 17752483.0, "reward": -2.2119140625, "reward_std": 1.5108152627944946, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.205078125, "rewards/ppl_reward/std": 8.08755111694336, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 222.609375, "completions/mean_terminated_length": 222.609375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9430399025281755, "grad_norm": 2.000763416290283, "kl": 2.271484375, "learning_rate": 1.952622646329457e-05, "loss": 0.0078, "num_tokens": 17773994.0, "reward": -1.485107421875, "reward_std": 0.7599893808364868, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.31396484375, "rewards/ppl_reward/std": 3.035565137863159, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2314550280570984, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 276.65625, "completions/mean_terminated_length": 239.90162658691406, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.9442583003350594, "grad_norm": 7.009562969207764, "kl": 11.375, "learning_rate": 1.9523633498244926e-05, "loss": 0.5456, "num_tokens": 17799332.0, "reward": -2.86962890625, "reward_std": 2.2190001010894775, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -7.9970703125, "rewards/ppl_reward/std": 8.521260261535645, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.3077768087387085, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 464.1875, "completions/mean_terminated_length": 277.5833435058594, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.9454766981419434, "grad_norm": 10.770435333251953, "kl": 17.15625, "learning_rate": 1.952103363000793e-05, "loss": 0.9457, "num_tokens": 17835240.0, "reward": -3.301025390625, "reward_std": 2.567840099334717, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -8.54736328125, "rewards/ppl_reward/std": 5.2432861328125, "rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.33627331256866455, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 404.28125, "completions/mean_terminated_length": 275.6603698730469, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9466950959488273, "grad_norm": 17.93014144897461, "kl": 18.21875, "learning_rate": 1.9518426860468076e-05, "loss": 0.696, "num_tokens": 17867674.0, "reward": -3.8702392578125, "reward_std": 2.2456214427948, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -9.576416015625, "rewards/ppl_reward/std": 8.171914100646973, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.3573691248893738, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 379.046875, "completions/mean_terminated_length": 347.3278503417969, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9479134937557112, "grad_norm": 6.1082892417907715, "kl": 10.453125, "learning_rate": 1.9515813191514884e-05, "loss": 0.3936, "num_tokens": 17898997.0, "reward": -2.336181640625, "reward_std": 2.3556625843048096, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.49776285886764526, "rewards/ppl_reward/mean": -6.75048828125, "rewards/ppl_reward/std": 6.039872169494629, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.34494292736053467, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 535.671875, "completions/mean_terminated_length": 485.1551818847656, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9491318915625951, "grad_norm": 3.590294122695923, "kl": 3.578125, "learning_rate": 1.9513192625042867e-05, "loss": 0.087, "num_tokens": 17940944.0, "reward": -1.747314453125, "reward_std": 1.8008685111999512, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.501733124256134, "rewards/ppl_reward/mean": -5.97119140625, "rewards/ppl_reward/std": 3.0612967014312744, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.3234921991825104, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 647.90625, "completions/mean_terminated_length": 464.2325439453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.9503502893694792, "grad_norm": 1.2359205484390259, "kl": 1.236328125, "learning_rate": 1.9510565162951538e-05, "loss": -0.0555, "num_tokens": 17989938.0, "reward": -1.65234375, "reward_std": 1.0305514335632324, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -5.6953125, "rewards/ppl_reward/std": 2.534048557281494, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.2987033724784851, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 533.71875, "completions/mean_terminated_length": 453.49090576171875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9515686871763631, "grad_norm": 0.8135776519775391, "kl": 1.091796875, "learning_rate": 1.9507930807145406e-05, "loss": 0.0458, "num_tokens": 18031336.0, "reward": -1.572265625, "reward_std": 0.7546786069869995, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.32421875, "rewards/ppl_reward/std": 2.9475014209747314, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.2287265807390213, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 352.328125, "completions/mean_terminated_length": 341.66668701171875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.952787084983247, "grad_norm": 2.698134183883667, "kl": 1.3173828125, "learning_rate": 1.9505289559533977e-05, "loss": -0.051, "num_tokens": 18061389.0, "reward": -3.3653564453125, "reward_std": 1.4471765756607056, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -9.996337890625, "rewards/ppl_reward/std": 8.167729377746582, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.2567298710346222, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 238.109375, "completions/mean_terminated_length": 238.109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.954005482790131, "grad_norm": 3.793919324874878, "kl": 1.0458984375, "learning_rate": 1.950264142203176e-05, "loss": 0.1273, "num_tokens": 18083804.0, "reward": -0.819091796875, "reward_std": 0.4718816876411438, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.45068359375, "rewards/ppl_reward/std": 1.8984178304672241, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.10796641558408737, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 195.34375, "completions/mean_terminated_length": 195.34375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.9552238805970149, "grad_norm": 1.9505486488342285, "kl": 2.583984375, "learning_rate": 1.949998639655825e-05, "loss": 0.0723, "num_tokens": 18104138.0, "reward": -0.82861328125, "reward_std": 0.9269078969955444, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.9306640625, "rewards/ppl_reward/std": 3.187124013900757, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.2841015160083771, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 150.59375, "completions/mean_terminated_length": 150.59375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.9564422784038988, "grad_norm": 2.354107141494751, "kl": 3.505859375, "learning_rate": 1.9497324485037933e-05, "loss": 0.0462, "num_tokens": 18120400.0, "reward": -6.608154296875, "reward_std": 5.033623218536377, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -16.51318359375, "rewards/ppl_reward/std": 20.813390731811523, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.25865477323532104, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 179.203125, "completions/mean_terminated_length": 179.203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9576606762107828, "grad_norm": 1.8983628749847412, "kl": 2.71484375, "learning_rate": 1.9494655689400294e-05, "loss": 0.0096, "num_tokens": 18139053.0, "reward": -13.4310302734375, "reward_std": 3.0828561782836914, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -30.174560546875, "rewards/ppl_reward/std": 67.31814575195312, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 155.65625, "completions/mean_terminated_length": 155.65625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9588790740176668, "grad_norm": 2.5287301540374756, "kl": 5.6171875, "learning_rate": 1.9491980011579805e-05, "loss": 0.1738, "num_tokens": 18156839.0, "reward": -1.39794921875, "reward_std": 1.0459785461425781, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.9990234375, "rewards/ppl_reward/std": 3.602083206176758, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2634054720401764, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 163.640625, "completions/mean_terminated_length": 163.640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.9600974718245507, "grad_norm": 3.1296024322509766, "kl": 3.7265625, "learning_rate": 1.9489297453515926e-05, "loss": 0.1196, "num_tokens": 18174816.0, "reward": -1.2047119140625, "reward_std": 0.9161862730979919, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.800048828125, "rewards/ppl_reward/std": 3.580885887145996, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20518454909324646, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 148.609375, "completions/mean_terminated_length": 148.609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.9613158696314347, "grad_norm": 2.5774176120758057, "kl": 6.3671875, "learning_rate": 1.94866080171531e-05, "loss": 0.2913, "num_tokens": 18190959.0, "reward": -5.911376953125, "reward_std": 3.529604434967041, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -15.09619140625, "rewards/ppl_reward/std": 11.721986770629883, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2781464755535126, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 140.59375, "completions/mean_terminated_length": 140.59375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.9625342674383186, "grad_norm": 2.0070085525512695, "kl": 3.654296875, "learning_rate": 1.9483911704440766e-05, "loss": 0.1341, "num_tokens": 18206725.0, "reward": -0.92822265625, "reward_std": 1.2949533462524414, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.3642578125, "rewards/ppl_reward/std": 3.417689800262451, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1925172060728073, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 143.8125, "completions/mean_terminated_length": 143.8125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.9637526652452025, "grad_norm": 2.5407235622406006, "kl": 4.734375, "learning_rate": 1.9481208517333336e-05, "loss": 0.2005, "num_tokens": 18222657.0, "reward": -4.30859375, "reward_std": 1.5582897663116455, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -12.0625, "rewards/ppl_reward/std": 6.977176666259766, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 146.328125, "completions/mean_terminated_length": 146.328125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9649710630520865, "grad_norm": 2.9014155864715576, "kl": 5.4287109375, "learning_rate": 1.9478498457790213e-05, "loss": 0.1219, "num_tokens": 18239406.0, "reward": -1.1771240234375, "reward_std": 0.8188413381576538, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.651123046875, "rewards/ppl_reward/std": 2.3993215560913086, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2708333432674408, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 132.890625, "completions/mean_terminated_length": 132.890625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9661894608589705, "grad_norm": 2.526848554611206, "kl": 4.6376953125, "learning_rate": 1.9475781527775776e-05, "loss": 0.1929, "num_tokens": 18254975.0, "reward": -2.1826171875, "reward_std": 1.1298048496246338, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.818359375, "rewards/ppl_reward/std": 4.956033706665039, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 136.28125, "completions/mean_terminated_length": 136.28125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9674078586658544, "grad_norm": 4.123267650604248, "kl": 4.021484375, "learning_rate": 1.9473057729259386e-05, "loss": 0.1904, "num_tokens": 18270809.0, "reward": -2.206787109375, "reward_std": 1.4258146286010742, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.75732421875, "rewards/ppl_reward/std": 5.184847831726074, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2630521357059479, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 127.171875, "completions/mean_terminated_length": 127.171875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9686262564727384, "grad_norm": 2.105017900466919, "kl": 2.287109375, "learning_rate": 1.9470327064215383e-05, "loss": -0.0159, "num_tokens": 18285228.0, "reward": -3.49658203125, "reward_std": 1.9031636714935303, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -10.3525390625, "rewards/ppl_reward/std": 6.093019962310791, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2662152051925659, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 116.84375, "completions/mean_terminated_length": 116.84375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9698446542796223, "grad_norm": 3.340337038040161, "kl": 4.1953125, "learning_rate": 1.946758953462309e-05, "loss": 0.0191, "num_tokens": 18299106.0, "reward": -2.1173095703125, "reward_std": 1.9194285869598389, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -7.148681640625, "rewards/ppl_reward/std": 5.4216437339782715, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.35036033391952515, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9710630520865062, "grad_norm": 2.5213613510131836, "kl": 3.66015625, "learning_rate": 1.9464845142466795e-05, "loss": -0.031, "num_tokens": 18313604.0, "reward": -1.9208984375, "reward_std": 1.2971992492675781, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -6.599609375, "rewards/ppl_reward/std": 4.419480323791504, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.3766292631626129, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 124.203125, "completions/mean_terminated_length": 124.203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9722814498933902, "grad_norm": 2.1900877952575684, "kl": 1.72265625, "learning_rate": 1.9462093889735766e-05, "loss": -0.0089, "num_tokens": 18328529.0, "reward": -0.9969482421875, "reward_std": 0.8743548393249512, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.493896484375, "rewards/ppl_reward/std": 3.6735172271728516, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2459997534751892, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9734998477002741, "grad_norm": 2.898362636566162, "kl": 1.98046875, "learning_rate": 1.9459335778424245e-05, "loss": -0.0558, "num_tokens": 18343665.0, "reward": -1.4149169921875, "reward_std": 1.1813733577728271, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.251708984375, "rewards/ppl_reward/std": 3.616881847381592, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2662152051925659, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 126.5625, "completions/mean_terminated_length": 126.5625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9747182455071581, "grad_norm": 1.8738309144973755, "kl": 1.7822265625, "learning_rate": 1.9456570810531442e-05, "loss": 0.1107, "num_tokens": 18358653.0, "reward": -1.973876953125, "reward_std": 0.8656952381134033, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.60400390625, "rewards/ppl_reward/std": 3.685035228729248, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 119.171875, "completions/mean_terminated_length": 119.171875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9759366433140421, "grad_norm": 2.0393195152282715, "kl": 2.0185546875, "learning_rate": 1.9453798988061535e-05, "loss": 0.0147, "num_tokens": 18372736.0, "reward": -2.590087890625, "reward_std": 1.5259939432144165, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.82080078125, "rewards/ppl_reward/std": 5.268942832946777, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 121.609375, "completions/mean_terminated_length": 121.609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.977155041120926, "grad_norm": 2.5325350761413574, "kl": 1.9599609375, "learning_rate": 1.945102031302368e-05, "loss": 0.0258, "num_tokens": 18388167.0, "reward": -2.1368408203125, "reward_std": 1.0711183547973633, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.812744140625, "rewards/ppl_reward/std": 4.551942348480225, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2948119342327118, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 108.90625, "completions/mean_terminated_length": 108.90625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9783734389278099, "grad_norm": 3.9484899044036865, "kl": 3.873046875, "learning_rate": 1.9448234787431993e-05, "loss": 0.1849, "num_tokens": 18401833.0, "reward": -1.18115234375, "reward_std": 2.2902495861053467, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.6982421875, "rewards/ppl_reward/std": 4.826076030731201, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.3656023144721985, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9795918367346939, "grad_norm": 5.195118427276611, "kl": 5.29296875, "learning_rate": 1.9445442413305556e-05, "loss": 0.1531, "num_tokens": 18416505.0, "reward": -4.2352294921875, "reward_std": 1.669076919555664, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -11.954833984375, "rewards/ppl_reward/std": 16.23495864868164, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.31487196683883667, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 121.40625, "completions/mean_terminated_length": 121.40625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.9808102345415778, "grad_norm": 3.957515239715576, "kl": 5.8046875, "learning_rate": 1.9442643192668418e-05, "loss": 0.2142, "num_tokens": 18431283.0, "reward": -3.22283935546875, "reward_std": 1.4610158205032349, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.0159912109375, "rewards/ppl_reward/std": 15.706392288208008, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.25730282068252563, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 118.140625, "completions/mean_terminated_length": 118.140625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.9820286323484618, "grad_norm": 4.870558261871338, "kl": 8.595703125, "learning_rate": 1.9439837127549587e-05, "loss": 0.4184, "num_tokens": 18445708.0, "reward": -1.51806640625, "reward_std": 1.5925538539886475, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.1845703125, "rewards/ppl_reward/std": 3.529383659362793, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.31435462832450867, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 128.734375, "completions/mean_terminated_length": 114.52381896972656, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.9832470301553458, "grad_norm": 6.783117294311523, "kl": 13.06640625, "learning_rate": 1.943702421998303e-05, "loss": 0.7553, "num_tokens": 18460363.0, "reward": -1.84619140625, "reward_std": 1.606164574623108, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.9658203125, "rewards/ppl_reward/std": 5.1481032371521, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.27455660700798035, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.9844654279622297, "grad_norm": 6.429581165313721, "kl": 6.84765625, "learning_rate": 1.9434204472007682e-05, "loss": 0.1672, "num_tokens": 18474043.0, "reward": -1.624267578125, "reward_std": 2.520540237426758, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.61572265625, "rewards/ppl_reward/std": 7.886785507202148, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.30534976720809937, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 110.453125, "completions/mean_terminated_length": 110.453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.9856838257691136, "grad_norm": 2.6024045944213867, "kl": 4.0078125, "learning_rate": 1.9431377885667433e-05, "loss": 0.0874, "num_tokens": 18487320.0, "reward": -3.03125, "reward_std": 2.296271800994873, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.4921875, "rewards/ppl_reward/std": 9.983071327209473, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.25439465045928955, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 134.4375, "completions/mean_terminated_length": 134.4375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.9869022235759976, "grad_norm": 1.944289207458496, "kl": 2.1953125, "learning_rate": 1.9428544463011125e-05, "loss": 0.0283, "num_tokens": 18503524.0, "reward": -2.387451171875, "reward_std": 1.390687108039856, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.46240234375, "rewards/ppl_reward/std": 9.943888664245605, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 123.296875, "completions/mean_terminated_length": 123.296875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.9881206213828815, "grad_norm": 2.658351182937622, "kl": 3.095703125, "learning_rate": 1.9425704206092562e-05, "loss": 0.1028, "num_tokens": 18519543.0, "reward": -0.713134765625, "reward_std": 0.7361874580383301, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.88720703125, "rewards/ppl_reward/std": 2.2756807804107666, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 132.78125, "completions/mean_terminated_length": 132.78125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9893390191897654, "grad_norm": 3.1819379329681396, "kl": 5.458984375, "learning_rate": 1.9422857116970495e-05, "loss": 0.1489, "num_tokens": 18535465.0, "reward": -0.5416259765625, "reward_std": 0.9666951298713684, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -4.294189453125, "rewards/ppl_reward/std": 1.5186189413070679, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.28102946281433105, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9905574169966495, "grad_norm": 3.4418838024139404, "kl": 2.99609375, "learning_rate": 1.942000319770864e-05, "loss": 0.0954, "num_tokens": 18550015.0, "reward": -2.0067138671875, "reward_std": 1.084360122680664, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.396240234375, "rewards/ppl_reward/std": 3.614423990249634, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 151.90625, "completions/mean_terminated_length": 151.90625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.9917758148035334, "grad_norm": 3.3118226528167725, "kl": 7.1640625, "learning_rate": 1.9417142450375643e-05, "loss": 0.2701, "num_tokens": 18567753.0, "reward": -0.625, "reward_std": 1.2287838459014893, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "rewards/ppl_reward/mean": -4.109375, "rewards/ppl_reward/std": 1.6725807189941406, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.28942593932151794, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.9929942126104173, "grad_norm": 4.247884273529053, "kl": 8.34375, "learning_rate": 1.9414274877045117e-05, "loss": 0.3173, "num_tokens": 18586497.0, "reward": -2.2388916015625, "reward_std": 1.5611114501953125, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -7.274658203125, "rewards/ppl_reward/std": 4.839175701141357, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.3203382194042206, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 136.03125, "completions/mean_terminated_length": 136.03125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9942126104173012, "grad_norm": 2.8398797512054443, "kl": 5.4296875, "learning_rate": 1.9411400479795618e-05, "loss": 0.1381, "num_tokens": 18601939.0, "reward": -5.026611328125, "reward_std": 4.677097797393799, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -13.25634765625, "rewards/ppl_reward/std": 24.679920196533203, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2744719088077545, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 143.640625, "completions/mean_terminated_length": 143.640625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9954310082241852, "grad_norm": 2.8487539291381836, "kl": 4.91015625, "learning_rate": 1.9408519260710643e-05, "loss": 0.2336, "num_tokens": 18617740.0, "reward": -1.71063232421875, "reward_std": 1.4991681575775146, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "rewards/ppl_reward/mean": -6.3587646484375, "rewards/ppl_reward/std": 3.5653438568115234, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.27048972249031067, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9966494060310691, "grad_norm": 6.363589763641357, "kl": 9.1953125, "learning_rate": 1.9405631221878645e-05, "loss": 0.3288, "num_tokens": 18634468.0, "reward": -6.6566162109375, "reward_std": 3.6796820163726807, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49501484632492065, "rewards/ppl_reward/mean": -16.039794921875, "rewards/ppl_reward/std": 22.09617042541504, "rewards/tag_count_reward/mean": 0.76953125, "rewards/tag_count_reward/std": 0.2897203266620636, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 138.203125, "completions/mean_terminated_length": 138.203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.997867803837953, "grad_norm": 2.2997164726257324, "kl": 3.23828125, "learning_rate": 1.940273636539301e-05, "loss": 0.0318, "num_tokens": 18649793.0, "reward": -2.41748046875, "reward_std": 2.5729482173919678, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.2021484375, "rewards/ppl_reward/std": 10.278287887573242, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20225508511066437, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 144.43243408203125, "completions/mean_terminated_length": 144.43243408203125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9990862016448371, "grad_norm": 3.0302772521972656, "kl": 1.73046875, "learning_rate": 1.9399834693352063e-05, "loss": 0.0565, "num_tokens": 18667312.0, "reward": -2.204833984375, "reward_std": 0.8999186754226685, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.02685546875, "rewards/ppl_reward/std": 6.053974151611328, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 138.390625, "completions/mean_terminated_length": 138.390625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.001218397806884, "grad_norm": 1.693712830543518, "kl": 2.88671875, "learning_rate": 1.9396926207859085e-05, "loss": 0.0567, "num_tokens": 18683161.0, "reward": -1.415771484375, "reward_std": 0.8247172832489014, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.37841796875, "rewards/ppl_reward/std": 3.283198833465576, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 155.9375, "completions/mean_terminated_length": 155.9375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.0024367956137679, "grad_norm": 3.518299102783203, "kl": 1.884765625, "learning_rate": 1.9394010911022283e-05, "loss": 0.132, "num_tokens": 18700029.0, "reward": -1.0389404296875, "reward_std": 0.7037788033485413, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.734130859375, "rewards/ppl_reward/std": 2.8681559562683105, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 170.671875, "completions/mean_terminated_length": 170.671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.003655193420652, "grad_norm": 2.3824429512023926, "kl": 2.5244140625, "learning_rate": 1.93910888049548e-05, "loss": -0.0303, "num_tokens": 18717888.0, "reward": -2.243408203125, "reward_std": 1.049149513244629, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.93212890625, "rewards/ppl_reward/std": 4.243074893951416, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 191.84375, "completions/mean_terminated_length": 191.84375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.0048735912275357, "grad_norm": 3.2836625576019287, "kl": 10.296875, "learning_rate": 1.9388159891774725e-05, "loss": 0.5475, "num_tokens": 18737150.0, "reward": -2.3922119140625, "reward_std": 2.476229190826416, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -7.870361328125, "rewards/ppl_reward/std": 7.498528957366943, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.3107839822769165, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 212.59375, "completions/mean_terminated_length": 212.59375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.0060919890344198, "grad_norm": 6.2141852378845215, "kl": 13.22265625, "learning_rate": 1.9385224173605072e-05, "loss": 0.6827, "num_tokens": 18759164.0, "reward": -0.7991943359375, "reward_std": 0.8914183378219604, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -4.754638671875, "rewards/ppl_reward/std": 1.7526391744613647, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2903883755207062, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 239.828125, "completions/mean_terminated_length": 214.53225708007812, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.0073103868413036, "grad_norm": 4.7331061363220215, "kl": 12.9921875, "learning_rate": 1.9382281652573787e-05, "loss": 0.7924, "num_tokens": 18781745.0, "reward": -2.77587890625, "reward_std": 1.8485107421875, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -8.5283203125, "rewards/ppl_reward/std": 5.816132545471191, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.2895062565803528, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 281.03125, "completions/mean_terminated_length": 281.03125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.0085287846481876, "grad_norm": 3.1912295818328857, "kl": 12.34375, "learning_rate": 1.937933233081376e-05, "loss": 0.7618, "num_tokens": 18807171.0, "reward": -2.1087646484375, "reward_std": 1.342488408088684, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.569091796875, "rewards/ppl_reward/std": 6.758418560028076, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2519455552101135, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 219.03125, "completions/mean_terminated_length": 219.03125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.0097471824550717, "grad_norm": 1.4562206268310547, "kl": 4.45703125, "learning_rate": 1.9376376210462792e-05, "loss": 0.115, "num_tokens": 18828869.0, "reward": -1.0784912109375, "reward_std": 0.7533445358276367, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.531982421875, "rewards/ppl_reward/std": 4.853420734405518, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.24193336069583893, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 233.390625, "completions/mean_terminated_length": 233.390625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.0109655802619555, "grad_norm": 1.6523137092590332, "kl": 6.890625, "learning_rate": 1.9373413293663625e-05, "loss": 0.2683, "num_tokens": 18851046.0, "reward": -2.296630859375, "reward_std": 1.639784336090088, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.86669921875, "rewards/ppl_reward/std": 3.95312762260437, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.3403056263923645, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.0121839780688395, "grad_norm": 1.1952855587005615, "kl": 2.5693359375, "learning_rate": 1.9370443582563926e-05, "loss": -0.0307, "num_tokens": 18876454.0, "reward": -0.3505859375, "reward_std": 0.6567258834838867, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.294921875, "rewards/ppl_reward/std": 1.8862253427505493, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 264.796875, "completions/mean_terminated_length": 264.796875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.0134023758757233, "grad_norm": 0.9719745516777039, "kl": 0.8212890625, "learning_rate": 1.936746707931628e-05, "loss": -0.0579, "num_tokens": 18900105.0, "reward": -1.6065673828125, "reward_std": 1.058040738105774, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.056884765625, "rewards/ppl_reward/std": 5.873520851135254, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 262.671875, "completions/mean_terminated_length": 262.671875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.0146207736826074, "grad_norm": 1.7606115341186523, "kl": 1.0634765625, "learning_rate": 1.9364483786078204e-05, "loss": 0.1046, "num_tokens": 18923604.0, "reward": -8.1514892578125, "reward_std": 4.519126892089844, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -20.076416015625, "rewards/ppl_reward/std": 43.96181869506836, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20499558746814728, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 278.9375, "completions/mean_terminated_length": 278.9375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.0158391714894912, "grad_norm": 1.2379121780395508, "kl": 1.6708984375, "learning_rate": 1.9361493705012127e-05, "loss": -0.0969, "num_tokens": 18949008.0, "reward": -0.9833984375, "reward_std": 1.0886229276657104, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.646484375, "rewards/ppl_reward/std": 3.1930058002471924, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 271.640625, "completions/mean_terminated_length": 271.640625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.0170575692963753, "grad_norm": 2.9035215377807617, "kl": 1.123046875, "learning_rate": 1.9358496838285408e-05, "loss": 0.0563, "num_tokens": 18972905.0, "reward": -1.913330078125, "reward_std": 1.0979599952697754, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.52978515625, "rewards/ppl_reward/std": 4.9577531814575195, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 225.203125, "completions/mean_terminated_length": 225.203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.0182759671032593, "grad_norm": 1.867233395576477, "kl": 3.93359375, "learning_rate": 1.9355493188070315e-05, "loss": -0.0666, "num_tokens": 18994462.0, "reward": -3.883056640625, "reward_std": 3.0451178550720215, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4917473793029785, "rewards/ppl_reward/mean": -10.54736328125, "rewards/ppl_reward/std": 9.44600772857666, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.3196600377559662, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 168.796875, "completions/mean_terminated_length": 168.796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.0194943649101431, "grad_norm": 3.646296977996826, "kl": 5.671875, "learning_rate": 1.9352482756544044e-05, "loss": 0.0735, "num_tokens": 19011969.0, "reward": -3.2918701171875, "reward_std": 3.414745807647705, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.5037065148353577, "rewards/ppl_reward/mean": -9.068115234375, "rewards/ppl_reward/std": 8.91739559173584, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.3354656398296356, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 157.953125, "completions/mean_terminated_length": 157.953125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.0207127627170272, "grad_norm": 3.7079741954803467, "kl": 5.36328125, "learning_rate": 1.93494655458887e-05, "loss": 0.0319, "num_tokens": 19028254.0, "reward": -7.044677734375, "reward_std": 7.889036655426025, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49501484632492065, "rewards/ppl_reward/mean": -16.80029296875, "rewards/ppl_reward/std": 30.497745513916016, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.32253241539001465, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 170.015625, "completions/mean_terminated_length": 170.015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.021931160523911, "grad_norm": 5.658080577850342, "kl": 5.17578125, "learning_rate": 1.9346441558291298e-05, "loss": -0.019, "num_tokens": 19046447.0, "reward": -2.68499755859375, "reward_std": 1.753563642501831, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49501484632492065, "rewards/ppl_reward/mean": -8.1903076171875, "rewards/ppl_reward/std": 7.07589054107666, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.29291296005249023, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 204.3125, "completions/mean_terminated_length": 204.3125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.023149558330795, "grad_norm": 1.9836515188217163, "kl": 3.310546875, "learning_rate": 1.9343410795943768e-05, "loss": 0.0042, "num_tokens": 19066979.0, "reward": -1.11810302734375, "reward_std": 0.8226479291915894, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.7752685546875, "rewards/ppl_reward/std": 3.1331124305725098, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 179.96875, "completions/mean_terminated_length": 179.96875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.024367956137679, "grad_norm": 2.1883544921875, "kl": 3.7578125, "learning_rate": 1.9340373261042962e-05, "loss": -0.1164, "num_tokens": 19085033.0, "reward": -1.5660400390625, "reward_std": 2.081068515777588, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -6.132080078125, "rewards/ppl_reward/std": 4.454247951507568, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.3421454429626465, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.0255863539445629, "grad_norm": 4.754009246826172, "kl": 6.4375, "learning_rate": 1.933732895579062e-05, "loss": 0.0937, "num_tokens": 19101047.0, "reward": -12.4091796875, "reward_std": 14.613725662231445, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -27.865234375, "rewards/ppl_reward/std": 78.02706146240234, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.31644338369369507, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.026804751751447, "grad_norm": 2.4984750747680664, "kl": 4.08203125, "learning_rate": 1.9334277882393414e-05, "loss": 0.0341, "num_tokens": 19117827.0, "reward": -4.20361328125, "reward_std": 2.0061259269714355, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -11.8369140625, "rewards/ppl_reward/std": 7.3858418464660645, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 159.78125, "completions/mean_terminated_length": 159.78125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.0280231495583307, "grad_norm": 2.1371145248413086, "kl": 2.40234375, "learning_rate": 1.9331220043062894e-05, "loss": -0.1733, "num_tokens": 19134477.0, "reward": -3.8330078125, "reward_std": 1.3336608409881592, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -11.150390625, "rewards/ppl_reward/std": 11.433245658874512, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.12962667644023895, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 167.765625, "completions/mean_terminated_length": 167.765625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.0292415473652148, "grad_norm": 2.7651171684265137, "kl": 1.439453125, "learning_rate": 1.932815544001554e-05, "loss": -0.0885, "num_tokens": 19152334.0, "reward": -0.0126953125, "reward_std": 0.47970229387283325, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.697265625, "rewards/ppl_reward/std": 1.0860575437545776, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10175786912441254, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 165.453125, "completions/mean_terminated_length": 165.453125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.0304599451720986, "grad_norm": 2.358419179916382, "kl": 1.189453125, "learning_rate": 1.9325084075472714e-05, "loss": -0.0158, "num_tokens": 19170675.0, "reward": -0.86181640625, "reward_std": 0.4563247561454773, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.6064453125, "rewards/ppl_reward/std": 4.0829644203186035, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 191.796875, "completions/mean_terminated_length": 191.796875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.0316783429789826, "grad_norm": 3.6887762546539307, "kl": 1.462890625, "learning_rate": 1.93220059516607e-05, "loss": 0.0795, "num_tokens": 19189774.0, "reward": -0.7764892578125, "reward_std": 0.4927675426006317, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.459228515625, "rewards/ppl_reward/std": 2.5631256103515625, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.0328967407858667, "grad_norm": 5.900251865386963, "kl": 2.67578125, "learning_rate": 1.931892107081066e-05, "loss": 0.2504, "num_tokens": 19207794.0, "reward": -1.88330078125, "reward_std": 1.3540492057800293, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.2900390625, "rewards/ppl_reward/std": 4.8270745277404785, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.25439465045928955, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 155.6875, "completions/mean_terminated_length": 155.6875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.0341151385927505, "grad_norm": 4.548760890960693, "kl": 3.51953125, "learning_rate": 1.9315829435158672e-05, "loss": 0.2348, "num_tokens": 19224390.0, "reward": -3.86474609375, "reward_std": 3.219137191772461, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.1669921875, "rewards/ppl_reward/std": 12.429540634155273, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2812775671482086, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 139.796875, "completions/mean_terminated_length": 139.796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.0353335363996345, "grad_norm": 2.9115772247314453, "kl": 5.578125, "learning_rate": 1.9312731046945694e-05, "loss": 0.3302, "num_tokens": 19239505.0, "reward": -47.7685546875, "reward_std": 78.00406646728516, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -98.982421875, "rewards/ppl_reward/std": 475.02996826171875, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2851906716823578, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 136.484375, "completions/mean_terminated_length": 136.484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.0365519342065184, "grad_norm": 2.4534895420074463, "kl": 4.900390625, "learning_rate": 1.930962590841759e-05, "loss": 0.12, "num_tokens": 19254400.0, "reward": -7.41748046875, "reward_std": 3.2430224418640137, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -18.0927734375, "rewards/ppl_reward/std": 31.93817901611328, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.32310864329338074, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 127.953125, "completions/mean_terminated_length": 127.953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.0377703320134024, "grad_norm": 7.933717727661133, "kl": 10.765625, "learning_rate": 1.930651402182512e-05, "loss": 0.4185, "num_tokens": 19269637.0, "reward": -3.310791015625, "reward_std": 1.9182275533676147, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -9.75439453125, "rewards/ppl_reward/std": 9.973933219909668, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.33994102478027344, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 136.796875, "completions/mean_terminated_length": 136.796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.0389887298202862, "grad_norm": 4.26050329208374, "kl": 9.6875, "learning_rate": 1.9303395389423918e-05, "loss": 0.4854, "num_tokens": 19285744.0, "reward": -1.2047119140625, "reward_std": 1.347798466682434, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.589111328125, "rewards/ppl_reward/std": 3.1338648796081543, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.3129708766937256, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 129.859375, "completions/mean_terminated_length": 129.859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.0402071276271703, "grad_norm": 3.5858922004699707, "kl": 6.7890625, "learning_rate": 1.9300270013474526e-05, "loss": 0.1583, "num_tokens": 19300711.0, "reward": -2.7493896484375, "reward_std": 3.725569725036621, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.717529296875, "rewards/ppl_reward/std": 12.561613082885742, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.3165413439273834, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 122.46875, "completions/mean_terminated_length": 122.46875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.0414255254340543, "grad_norm": 6.013157844543457, "kl": 7.20703125, "learning_rate": 1.9297137896242366e-05, "loss": 0.1672, "num_tokens": 19315733.0, "reward": -3.42694091796875, "reward_std": 3.749058961868286, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -10.0335693359375, "rewards/ppl_reward/std": 18.32717514038086, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.3353501260280609, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.0426439232409381, "grad_norm": 2.253941774368286, "kl": 3.86328125, "learning_rate": 1.9293999039997745e-05, "loss": 0.005, "num_tokens": 19330462.0, "reward": -1.4359130859375, "reward_std": 0.9597556591033936, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.426513671875, "rewards/ppl_reward/std": 2.711449146270752, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.23168931901454926, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.0438623210478222, "grad_norm": 2.786433696746826, "kl": 5.2265625, "learning_rate": 1.9290853447015858e-05, "loss": 0.0803, "num_tokens": 19345230.0, "reward": -1.700439453125, "reward_std": 1.3387293815612793, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.87744140625, "rewards/ppl_reward/std": 3.8173234462738037, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2649018466472626, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 136.828125, "completions/mean_terminated_length": 136.828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.045080718854706, "grad_norm": 1.722366452217102, "kl": 2.8974609375, "learning_rate": 1.9287701119576784e-05, "loss": 0.054, "num_tokens": 19361459.0, "reward": -1.92333984375, "reward_std": 0.5072946548461914, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.5654296875, "rewards/ppl_reward/std": 3.157459020614624, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 135.421875, "completions/mean_terminated_length": 135.421875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.04629911666159, "grad_norm": 2.2755072116851807, "kl": 3.28515625, "learning_rate": 1.928454205996548e-05, "loss": 0.0314, "num_tokens": 19377670.0, "reward": -0.432373046875, "reward_std": 0.6940626502037048, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.47412109375, "rewards/ppl_reward/std": 1.640142560005188, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 136.234375, "completions/mean_terminated_length": 136.234375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.0475175144684739, "grad_norm": 3.582327127456665, "kl": 1.47265625, "learning_rate": 1.9281376270471778e-05, "loss": -0.0021, "num_tokens": 19392885.0, "reward": -1.5389404296875, "reward_std": 0.6463751196861267, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.859130859375, "rewards/ppl_reward/std": 3.59519100189209, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.048735912275358, "grad_norm": 2.5856826305389404, "kl": 2.2421875, "learning_rate": 1.9278203753390405e-05, "loss": 0.0787, "num_tokens": 19409493.0, "reward": -1.55560302734375, "reward_std": 1.283534288406372, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.8455810546875, "rewards/ppl_reward/std": 4.572335720062256, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 126.640625, "completions/mean_terminated_length": 126.640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.049954310082242, "grad_norm": 2.863675355911255, "kl": 3.21484375, "learning_rate": 1.927502451102095e-05, "loss": 0.0221, "num_tokens": 19424230.0, "reward": -2.58929443359375, "reward_std": 1.029970645904541, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.7332763671875, "rewards/ppl_reward/std": 5.224832057952881, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 130.984375, "completions/mean_terminated_length": 130.984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.0511727078891258, "grad_norm": 4.687244892120361, "kl": 2.74609375, "learning_rate": 1.9271838545667876e-05, "loss": 0.1056, "num_tokens": 19439637.0, "reward": 0.047119140625, "reward_std": 0.5765004754066467, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.57763671875, "rewards/ppl_reward/std": 1.0514384508132935, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 141.859375, "completions/mean_terminated_length": 141.859375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.0523911056960098, "grad_norm": 7.377728462219238, "kl": 5.29296875, "learning_rate": 1.9268645859640528e-05, "loss": 0.1638, "num_tokens": 19456196.0, "reward": -3.87353515625, "reward_std": 1.0583889484405518, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -11.0283203125, "rewards/ppl_reward/std": 6.721661567687988, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 151.6875, "completions/mean_terminated_length": 151.6875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.0536095035028936, "grad_norm": 5.760741233825684, "kl": 6.40625, "learning_rate": 1.9265446455253117e-05, "loss": 0.2106, "num_tokens": 19472736.0, "reward": -2.3663330078125, "reward_std": 1.5929045677185059, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.068603515625, "rewards/ppl_reward/std": 6.854238033294678, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.20859359204769135, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.0548279013097777, "grad_norm": 2.822387218475342, "kl": 5.42578125, "learning_rate": 1.926224033482473e-05, "loss": 0.249, "num_tokens": 19488632.0, "reward": -1.03173828125, "reward_std": 0.7758899927139282, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -5.2275390625, "rewards/ppl_reward/std": 3.326138734817505, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.18894122540950775, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.0560462991166615, "grad_norm": 1.623911738395691, "kl": 1.556640625, "learning_rate": 1.9259027500679312e-05, "loss": 0.0424, "num_tokens": 19504872.0, "reward": -1.173583984375, "reward_std": 0.39501041173934937, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.22998046875, "rewards/ppl_reward/std": 3.7932112216949463, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 157.671875, "completions/mean_terminated_length": 157.671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.0572646969235455, "grad_norm": 1.9136097431182861, "kl": 1.3486328125, "learning_rate": 1.9255807955145677e-05, "loss": 0.0588, "num_tokens": 19521867.0, "reward": -5.65716552734375, "reward_std": 0.7403194904327393, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -15.1502685546875, "rewards/ppl_reward/std": 26.009572982788086, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 160.546875, "completions/mean_terminated_length": 160.546875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.0584830947304296, "grad_norm": 2.144625186920166, "kl": 1.76171875, "learning_rate": 1.9252581700557512e-05, "loss": -0.0137, "num_tokens": 19539094.0, "reward": -3.701171875, "reward_std": 1.4295036792755127, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.94140625, "rewards/ppl_reward/std": 9.774133682250977, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.245463564991951, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 157.828125, "completions/mean_terminated_length": 157.828125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.0597014925373134, "grad_norm": 2.4139723777770996, "kl": 1.767578125, "learning_rate": 1.924934873925336e-05, "loss": -0.0949, "num_tokens": 19555971.0, "reward": -1.1942138671875, "reward_std": 0.9490081071853638, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.810302734375, "rewards/ppl_reward/std": 2.399658679962158, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24688033759593964, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.0609198903441974, "grad_norm": 5.300216197967529, "kl": 2.244140625, "learning_rate": 1.924610907357663e-05, "loss": 0.0058, "num_tokens": 19572323.0, "reward": -1.7381591796875, "reward_std": 0.9849056005477905, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.820068359375, "rewards/ppl_reward/std": 3.6132185459136963, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.24346621334552765, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 147.671875, "completions/mean_terminated_length": 147.671875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.0621382881510812, "grad_norm": 3.3016438484191895, "kl": 4.9921875, "learning_rate": 1.924286270587558e-05, "loss": 0.1647, "num_tokens": 19588486.0, "reward": -1.2305908203125, "reward_std": 0.7696812152862549, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.687744140625, "rewards/ppl_reward/std": 2.832442045211792, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2314215451478958, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 138.09375, "completions/mean_terminated_length": 138.09375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.0633566859579653, "grad_norm": 1.9550070762634277, "kl": 3.517578125, "learning_rate": 1.9239609638503335e-05, "loss": 0.0399, "num_tokens": 19604156.0, "reward": -2.1510009765625, "reward_std": 1.157555103302002, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -7.591064453125, "rewards/ppl_reward/std": 4.2034478187561035, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.22627368569374084, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 144.765625, "completions/mean_terminated_length": 144.765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.0645750837648493, "grad_norm": 2.9510512351989746, "kl": 2.80078125, "learning_rate": 1.923634987381788e-05, "loss": 0.0668, "num_tokens": 19620501.0, "reward": -2.297607421875, "reward_std": 1.3356716632843018, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -7.80615234375, "rewards/ppl_reward/std": 4.0411696434021, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2311534583568573, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 154.890625, "completions/mean_terminated_length": 154.890625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.0657934815717331, "grad_norm": 3.0587687492370605, "kl": 5.75, "learning_rate": 1.9233083414182043e-05, "loss": 0.2062, "num_tokens": 19637702.0, "reward": -1.476806640625, "reward_std": 1.4147623777389526, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4917473793029785, "rewards/ppl_reward/mean": -5.81298828125, "rewards/ppl_reward/std": 4.0043625831604, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.27265870571136475, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 151.03125, "completions/mean_terminated_length": 151.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.0670118793786172, "grad_norm": 3.9512624740600586, "kl": 6.2421875, "learning_rate": 1.9229810261963517e-05, "loss": 0.2748, "num_tokens": 19654464.0, "reward": -1.9237060546875, "reward_std": 1.4480807781219482, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -6.800537109375, "rewards/ppl_reward/std": 5.136748790740967, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.22589658200740814, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.068230277185501, "grad_norm": 7.994301795959473, "kl": 9.234375, "learning_rate": 1.9226530419534834e-05, "loss": 0.351, "num_tokens": 19670256.0, "reward": -0.9886474609375, "reward_std": 1.2394554615020752, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -4.946044921875, "rewards/ppl_reward/std": 3.5107345581054688, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.28824523091316223, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 136.390625, "completions/mean_terminated_length": 136.390625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.069448674992385, "grad_norm": 5.642995357513428, "kl": 7.9921875, "learning_rate": 1.9223243889273383e-05, "loss": 0.3354, "num_tokens": 19685905.0, "reward": -3.7105712890625, "reward_std": 1.7717558145523071, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -10.335205078125, "rewards/ppl_reward/std": 14.294968605041504, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.2711479663848877, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 149.5625, "completions/mean_terminated_length": 149.5625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.0706670727992689, "grad_norm": 4.205755710601807, "kl": 5.44140625, "learning_rate": 1.9219950673561405e-05, "loss": 0.2428, "num_tokens": 19703125.0, "reward": -1.650146484375, "reward_std": 1.2677874565124512, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.51123046875, "rewards/ppl_reward/std": 3.2107837200164795, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.22240428626537323, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 122.359375, "completions/mean_terminated_length": 122.359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.071885470606153, "grad_norm": 2.212420701980591, "kl": 2.873046875, "learning_rate": 1.9216650774785975e-05, "loss": 0.0354, "num_tokens": 19718084.0, "reward": -3.1434326171875, "reward_std": 1.1592732667922974, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.833740234375, "rewards/ppl_reward/std": 7.224554061889648, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 126.359375, "completions/mean_terminated_length": 126.359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.073103868413037, "grad_norm": 2.0421411991119385, "kl": 2.212890625, "learning_rate": 1.921334419533902e-05, "loss": 0.0044, "num_tokens": 19732371.0, "reward": -1.400390625, "reward_std": 0.8439112901687622, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.61328125, "rewards/ppl_reward/std": 6.289957523345947, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 134.4375, "completions/mean_terminated_length": 134.4375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.0743222662199208, "grad_norm": 1.9545708894729614, "kl": 1.1572265625, "learning_rate": 1.9210030937617303e-05, "loss": 0.0163, "num_tokens": 19748279.0, "reward": -0.7110595703125, "reward_std": 0.09527473151683807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -5.414306640625, "rewards/ppl_reward/std": 5.341888904571533, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.0755406640268048, "grad_norm": 2.8610799312591553, "kl": 2.6376953125, "learning_rate": 1.920671100402244e-05, "loss": 0.1947, "num_tokens": 19763471.0, "reward": -1.0382080078125, "reward_std": 0.2848612368106842, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.021728515625, "rewards/ppl_reward/std": 3.2705440521240234, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 138.984375, "completions/mean_terminated_length": 138.984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.0767590618336886, "grad_norm": 2.4653818607330322, "kl": 1.572265625, "learning_rate": 1.920338439696088e-05, "loss": -0.0158, "num_tokens": 19779614.0, "reward": -0.9056396484375, "reward_std": 0.30665817856788635, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.748779296875, "rewards/ppl_reward/std": 2.3333771228790283, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 184.703125, "completions/mean_terminated_length": 184.703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.0779774596405727, "grad_norm": 2.344547748565674, "kl": 6.2900390625, "learning_rate": 1.9200051118843895e-05, "loss": 0.2826, "num_tokens": 19799219.0, "reward": -1.4776611328125, "reward_std": 0.5839763879776001, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.720947265625, "rewards/ppl_reward/std": 3.168675422668457, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 242.390625, "completions/mean_terminated_length": 242.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.0791958574474565, "grad_norm": 10.173018455505371, "kl": 14.9375, "learning_rate": 1.9196711172087617e-05, "loss": 0.7766, "num_tokens": 19821076.0, "reward": -3.046630859375, "reward_std": 1.229370355606079, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -8.85888671875, "rewards/ppl_reward/std": 8.227978706359863, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.28510910272598267, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 333.796875, "completions/mean_terminated_length": 333.796875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.0804142552543405, "grad_norm": 25.75433921813965, "kl": 19.8125, "learning_rate": 1.9193364559112997e-05, "loss": 0.8051, "num_tokens": 19849127.0, "reward": -2.577392578125, "reward_std": 1.9024285078048706, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.4836103618144989, "rewards/ppl_reward/mean": -6.99072265625, "rewards/ppl_reward/std": 5.6298322677612305, "rewards/tag_count_reward/mean": 0.55859375, "rewards/tag_count_reward/std": 0.2772533595561981, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 229.9375, "completions/mean_terminated_length": 229.9375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.0816326530612246, "grad_norm": 18.895723342895508, "kl": 14.953125, "learning_rate": 1.919001128234582e-05, "loss": 0.6555, "num_tokens": 19870587.0, "reward": -3.0283203125, "reward_std": 1.8817461729049683, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5029674172401428, "rewards/ppl_reward/mean": -8.345703125, "rewards/ppl_reward/std": 10.545536041259766, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.2943909764289856, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 205.515625, "completions/mean_terminated_length": 205.515625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.0828510508681084, "grad_norm": 2.9395370483398438, "kl": 5.11328125, "learning_rate": 1.9186651344216703e-05, "loss": 0.1926, "num_tokens": 19891156.0, "reward": -1.6209716796875, "reward_std": 1.1367157697677612, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.671630859375, "rewards/ppl_reward/std": 4.111666202545166, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.1972891092300415, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.0840694486749924, "grad_norm": 3.2716314792633057, "kl": 9.4609375, "learning_rate": 1.9183284747161087e-05, "loss": 0.5115, "num_tokens": 19909596.0, "reward": -3.8306884765625, "reward_std": 2.397779703140259, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -10.911376953125, "rewards/ppl_reward/std": 16.59954261779785, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2519763112068176, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 191.671875, "completions/mean_terminated_length": 191.671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.0852878464818763, "grad_norm": 3.3349769115448, "kl": 4.0390625, "learning_rate": 1.917991149361925e-05, "loss": 0.1069, "num_tokens": 19928863.0, "reward": -2.3177490234375, "reward_std": 1.6255980730056763, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.026123046875, "rewards/ppl_reward/std": 6.186025142669678, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2002912163734436, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 221.9375, "completions/mean_terminated_length": 221.9375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.0865062442887603, "grad_norm": 2.1870357990264893, "kl": 3.33203125, "learning_rate": 1.9176531586036282e-05, "loss": 0.0219, "num_tokens": 19949955.0, "reward": -2.927734375, "reward_std": 1.0620290040969849, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.35546875, "rewards/ppl_reward/std": 5.748143672943115, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 224.71875, "completions/mean_terminated_length": 224.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.0877246420956443, "grad_norm": 1.9929635524749756, "kl": 2.7890625, "learning_rate": 1.9173145026862106e-05, "loss": -0.0074, "num_tokens": 19970969.0, "reward": -2.3828125, "reward_std": 2.322390556335449, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.1953125, "rewards/ppl_reward/std": 7.325310707092285, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 229.015625, "completions/mean_terminated_length": 229.015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.0889430399025282, "grad_norm": 3.006673574447632, "kl": 3.48046875, "learning_rate": 1.916975181855146e-05, "loss": 0.1011, "num_tokens": 19992826.0, "reward": -1.606689453125, "reward_std": 1.3438138961791992, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.55712890625, "rewards/ppl_reward/std": 3.4775192737579346, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20653989911079407, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 213.703125, "completions/mean_terminated_length": 213.703125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.0901614377094122, "grad_norm": 2.016390085220337, "kl": 2.626953125, "learning_rate": 1.916635196356391e-05, "loss": -0.0675, "num_tokens": 20013151.0, "reward": -4.294921875, "reward_std": 0.8804739713668823, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -12.05859375, "rewards/ppl_reward/std": 17.969411849975586, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22930191457271576, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 204.265625, "completions/mean_terminated_length": 204.265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.091379835516296, "grad_norm": 3.151571750640869, "kl": 5.0234375, "learning_rate": 1.916294546436383e-05, "loss": 0.1442, "num_tokens": 20032832.0, "reward": -1.9281005859375, "reward_std": 1.324735403060913, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -6.981201171875, "rewards/ppl_reward/std": 5.237967491149902, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.24800793826580048, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 176.640625, "completions/mean_terminated_length": 176.640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.09259823332318, "grad_norm": 2.25850772857666, "kl": 4.6875, "learning_rate": 1.9159532323420418e-05, "loss": 0.0836, "num_tokens": 20052233.0, "reward": -2.46124267578125, "reward_std": 1.1087658405303955, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -8.0865478515625, "rewards/ppl_reward/std": 7.006100654602051, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.2706902325153351, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.0938166311300639, "grad_norm": 2.048306703567505, "kl": 4.3203125, "learning_rate": 1.9156112543207674e-05, "loss": 0.0224, "num_tokens": 20068133.0, "reward": -8.042724609375, "reward_std": 9.895265579223633, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -19.35888671875, "rewards/ppl_reward/std": 56.25490951538086, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.26349374651908875, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 162.6875, "completions/mean_terminated_length": 162.6875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.095035028936948, "grad_norm": 1.9676295518875122, "kl": 4.33203125, "learning_rate": 1.9152686126204432e-05, "loss": 0.0909, "num_tokens": 20085513.0, "reward": -1.46221923828125, "reward_std": 1.0442252159118652, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.3775634765625, "rewards/ppl_reward/std": 3.3453969955444336, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20518454909324646, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 130.40625, "completions/mean_terminated_length": 130.40625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.0962534267438317, "grad_norm": 64.24768829345703, "kl": 6.9921875, "learning_rate": 1.9149253074894312e-05, "loss": 0.2604, "num_tokens": 20100531.0, "reward": -1.8857421875, "reward_std": 1.925418734550476, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -6.826171875, "rewards/ppl_reward/std": 5.113807678222656, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.2551248073577881, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.0974718245507158, "grad_norm": 1.913380742073059, "kl": 4.646484375, "learning_rate": 1.914581339176576e-05, "loss": 0.0908, "num_tokens": 20117635.0, "reward": -1.539306640625, "reward_std": 1.196854591369629, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.65673828125, "rewards/ppl_reward/std": 3.8045644760131836, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 139.953125, "completions/mean_terminated_length": 139.953125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.0986902223575998, "grad_norm": 3.8683111667633057, "kl": 7.1171875, "learning_rate": 1.9142367079312023e-05, "loss": 0.2366, "num_tokens": 20133832.0, "reward": -3.375, "reward_std": 3.118948221206665, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -10.0078125, "rewards/ppl_reward/std": 11.65920352935791, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2398419976234436, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 133.21875, "completions/mean_terminated_length": 133.21875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.0999086201644837, "grad_norm": 1.9723788499832153, "kl": 2.9052734375, "learning_rate": 1.9138914140031154e-05, "loss": 0.1306, "num_tokens": 20148798.0, "reward": -3.8331298828125, "reward_std": 0.8747090101242065, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -11.314697265625, "rewards/ppl_reward/std": 12.333883285522461, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.1011270179713677, "grad_norm": 2.3839070796966553, "kl": 3.306640625, "learning_rate": 1.913545457642601e-05, "loss": 0.1218, "num_tokens": 20165886.0, "reward": -0.9814453125, "reward_std": 0.9067567586898804, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.361328125, "rewards/ppl_reward/std": 2.258366346359253, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.25046461820602417, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 138.203125, "completions/mean_terminated_length": 138.203125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.1023454157782515, "grad_norm": 2.815270185470581, "kl": 4.876953125, "learning_rate": 1.913198839100425e-05, "loss": 0.0693, "num_tokens": 20181707.0, "reward": -1.5494384765625, "reward_std": 1.3856072425842285, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.434814453125, "rewards/ppl_reward/std": 4.152777671813965, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.27433067560195923, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 137.78125, "completions/mean_terminated_length": 137.78125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.1035638135851356, "grad_norm": 3.4523708820343018, "kl": 6.38671875, "learning_rate": 1.912851558627833e-05, "loss": 0.2721, "num_tokens": 20197973.0, "reward": -1.1729736328125, "reward_std": 1.1297705173492432, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.658447265625, "rewards/ppl_reward/std": 3.2558302879333496, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.24346621334552765, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 130.09375, "completions/mean_terminated_length": 130.09375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.1047822113920196, "grad_norm": 4.127015113830566, "kl": 4.5859375, "learning_rate": 1.9125036164765502e-05, "loss": 0.0784, "num_tokens": 20213787.0, "reward": -1.67041015625, "reward_std": 1.0446925163269043, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.8798828125, "rewards/ppl_reward/std": 4.355757713317871, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1925172060728073, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 128.234375, "completions/mean_terminated_length": 128.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.1060006091989034, "grad_norm": 4.194327354431152, "kl": 6.1796875, "learning_rate": 1.9121550128987824e-05, "loss": 0.2819, "num_tokens": 20229466.0, "reward": -0.94091796875, "reward_std": 0.9301973581314087, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.1552734375, "rewards/ppl_reward/std": 2.118945837020874, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.22240428626537323, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 135.53125, "completions/mean_terminated_length": 135.53125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.1072190070057875, "grad_norm": 2.038443088531494, "kl": 5.5234375, "learning_rate": 1.911805748147214e-05, "loss": 0.1637, "num_tokens": 20245948.0, "reward": -3.10662841796875, "reward_std": 2.5578205585479736, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -9.2913818359375, "rewards/ppl_reward/std": 13.318527221679688, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2662152051925659, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.1084374048126713, "grad_norm": 3.4335665702819824, "kl": 2.802734375, "learning_rate": 1.911455822475009e-05, "loss": 0.0158, "num_tokens": 20261100.0, "reward": -3.09100341796875, "reward_std": 1.3327478170394897, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.8148193359375, "rewards/ppl_reward/std": 10.315794944763184, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 117.40625, "completions/mean_terminated_length": 117.40625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.1096558026195553, "grad_norm": 4.836190700531006, "kl": 4.8359375, "learning_rate": 1.9111052361358102e-05, "loss": 0.2578, "num_tokens": 20275214.0, "reward": -1.2911376953125, "reward_std": 0.7533591985702515, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.894775390625, "rewards/ppl_reward/std": 3.441627025604248, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19669894874095917, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.1108742004264391, "grad_norm": 2.9707705974578857, "kl": 2.1796875, "learning_rate": 1.9107539893837396e-05, "loss": 0.0037, "num_tokens": 20290282.0, "reward": -2.19287109375, "reward_std": 1.020554542541504, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.0654296875, "rewards/ppl_reward/std": 8.998295783996582, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 124.046875, "completions/mean_terminated_length": 124.046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.1120925982333232, "grad_norm": 2.525697708129883, "kl": 2.14453125, "learning_rate": 1.9104020824733975e-05, "loss": -0.0573, "num_tokens": 20304685.0, "reward": -0.7432861328125, "reward_std": 0.8198514580726624, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.197509765625, "rewards/ppl_reward/std": 3.18654465675354, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 147.46875, "completions/mean_terminated_length": 147.46875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.1133109960402072, "grad_norm": 3.1010758876800537, "kl": 2.650390625, "learning_rate": 1.9100495156598632e-05, "loss": 0.1536, "num_tokens": 20321763.0, "reward": -0.9796142578125, "reward_std": 0.5379403233528137, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.654541015625, "rewards/ppl_reward/std": 2.788426399230957, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 130.65625, "completions/mean_terminated_length": 130.65625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.114529393847091, "grad_norm": 1.7900551557540894, "kl": 1.0771484375, "learning_rate": 1.909696289198694e-05, "loss": -0.0739, "num_tokens": 20336317.0, "reward": -2.8505859375, "reward_std": 0.8370465040206909, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.623046875, "rewards/ppl_reward/std": 12.781116485595703, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.115747791653975, "grad_norm": 1.6170685291290283, "kl": 2.265625, "learning_rate": 1.909342403345925e-05, "loss": 0.0231, "num_tokens": 20350997.0, "reward": -0.4627685546875, "reward_std": 0.4621464014053345, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.784912109375, "rewards/ppl_reward/std": 2.11482834815979, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.116966189460859, "grad_norm": 4.8916096687316895, "kl": 5.484375, "learning_rate": 1.9089878583580704e-05, "loss": 0.2166, "num_tokens": 20365629.0, "reward": -0.45947265625, "reward_std": 1.4165825843811035, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -4.3642578125, "rewards/ppl_reward/std": 3.7046477794647217, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1925172060728073, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 124.765625, "completions/mean_terminated_length": 124.765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.118184587267743, "grad_norm": 4.101722240447998, "kl": 5.40625, "learning_rate": 1.9086326544921206e-05, "loss": 0.1559, "num_tokens": 20380102.0, "reward": -1.1051025390625, "reward_std": 0.794858992099762, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.694580078125, "rewards/ppl_reward/std": 1.7664817571640015, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20518454909324646, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 137.84375, "completions/mean_terminated_length": 137.84375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.1194029850746268, "grad_norm": 4.291865825653076, "kl": 6.8828125, "learning_rate": 1.9082767920055454e-05, "loss": 0.2559, "num_tokens": 20396060.0, "reward": -2.1015625, "reward_std": 1.5872914791107178, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.515625, "rewards/ppl_reward/std": 5.121405601501465, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.27094778418540955, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 135.421875, "completions/mean_terminated_length": 135.421875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.1206213828815108, "grad_norm": 2.9835753440856934, "kl": 5.92578125, "learning_rate": 1.9079202711562903e-05, "loss": 0.1855, "num_tokens": 20411159.0, "reward": -1.6707763671875, "reward_std": 2.0475802421569824, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.732177734375, "rewards/ppl_reward/std": 4.610541343688965, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.31487196683883667, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.1218397806883948, "grad_norm": 2.5709850788116455, "kl": 5.787109375, "learning_rate": 1.907563092202779e-05, "loss": 0.2101, "num_tokens": 20427178.0, "reward": -2.5701904296875, "reward_std": 1.7023470401763916, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.406005859375, "rewards/ppl_reward/std": 5.59154748916626, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2708333432674408, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 126.828125, "completions/mean_terminated_length": 126.828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.1230581784952787, "grad_norm": 2.2941014766693115, "kl": 4.6796875, "learning_rate": 1.9072052554039123e-05, "loss": 0.1105, "num_tokens": 20442551.0, "reward": -2.91064453125, "reward_std": 2.850393533706665, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.2041015625, "rewards/ppl_reward/std": 11.201865196228027, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2816905975341797, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 126.953125, "completions/mean_terminated_length": 126.953125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.1242765763021627, "grad_norm": 1.929494857788086, "kl": 4.208984375, "learning_rate": 1.906846761019067e-05, "loss": 0.0414, "num_tokens": 20457276.0, "reward": -1.8941650390625, "reward_std": 1.193122148513794, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.335205078125, "rewards/ppl_reward/std": 4.142136096954346, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 150.703125, "completions/mean_terminated_length": 150.703125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.1254949741090465, "grad_norm": 2.3521337509155273, "kl": 4.013671875, "learning_rate": 1.906487609308097e-05, "loss": 0.0994, "num_tokens": 20474241.0, "reward": -1.484130859375, "reward_std": 0.9656392931938171, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.44482421875, "rewards/ppl_reward/std": 4.267928123474121, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1763816624879837, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 129.203125, "completions/mean_terminated_length": 129.203125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.1267133719159306, "grad_norm": 2.750481605529785, "kl": 2.46484375, "learning_rate": 1.906127800531333e-05, "loss": 0.0727, "num_tokens": 20489342.0, "reward": -4.227783203125, "reward_std": 2.671221971511841, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -12.33056640625, "rewards/ppl_reward/std": 15.785529136657715, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 129.953125, "completions/mean_terminated_length": 129.953125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.1279317697228146, "grad_norm": 2.205768346786499, "kl": 2.1015625, "learning_rate": 1.905767334949582e-05, "loss": -0.0687, "num_tokens": 20504219.0, "reward": -1.00592041015625, "reward_std": 0.6100186109542847, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.7149658203125, "rewards/ppl_reward/std": 3.943772077560425, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 149.890625, "completions/mean_terminated_length": 149.890625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.1291501675296984, "grad_norm": 2.278486967086792, "kl": 5.6943359375, "learning_rate": 1.9054062128241263e-05, "loss": 0.2125, "num_tokens": 20521052.0, "reward": -1.7401123046875, "reward_std": 0.6077929735183716, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.988037109375, "rewards/ppl_reward/std": 5.1096391677856445, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18729320168495178, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 137.5625, "completions/mean_terminated_length": 137.5625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.1303685653365825, "grad_norm": 2.112241506576538, "kl": 5.265625, "learning_rate": 1.905044434416725e-05, "loss": 0.1631, "num_tokens": 20536360.0, "reward": -1.564697265625, "reward_std": 1.2113094329833984, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.65283203125, "rewards/ppl_reward/std": 3.8348388671875, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1620931327342987, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 137.4375, "completions/mean_terminated_length": 137.4375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.1315869631434663, "grad_norm": 1.8495246171951294, "kl": 3.25, "learning_rate": 1.9046819999896117e-05, "loss": 0.072, "num_tokens": 20551716.0, "reward": -1.796142578125, "reward_std": 1.2408843040466309, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.25634765625, "rewards/ppl_reward/std": 4.472667217254639, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 153.109375, "completions/mean_terminated_length": 153.109375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.1328053609503503, "grad_norm": 2.906674861907959, "kl": 6.5, "learning_rate": 1.9043189098054974e-05, "loss": 0.2287, "num_tokens": 20568571.0, "reward": -1.177978515625, "reward_std": 1.5884333848953247, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.72314453125, "rewards/ppl_reward/std": 5.430988311767578, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2311534583568573, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 158.828125, "completions/mean_terminated_length": 158.828125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.1340237587572342, "grad_norm": 3.145538330078125, "kl": 7.609375, "learning_rate": 1.9039551641275664e-05, "loss": 0.2513, "num_tokens": 20586464.0, "reward": -1.23095703125, "reward_std": 1.0823233127593994, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.7431640625, "rewards/ppl_reward/std": 3.9622013568878174, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19158901274204254, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 156.828125, "completions/mean_terminated_length": 156.828125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.1352421565641182, "grad_norm": 2.125723361968994, "kl": 4.3251953125, "learning_rate": 1.903590763219479e-05, "loss": 0.0965, "num_tokens": 20603269.0, "reward": -2.815673828125, "reward_std": 1.5529024600982666, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.07666015625, "rewards/ppl_reward/std": 6.680722236633301, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.20590098202228546, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 137.6875, "completions/mean_terminated_length": 137.6875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.136460554371002, "grad_norm": 2.226729393005371, "kl": 3.98828125, "learning_rate": 1.9032257073453718e-05, "loss": 0.089, "num_tokens": 20619049.0, "reward": -5.056884765625, "reward_std": 2.4761290550231934, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -13.39501953125, "rewards/ppl_reward/std": 13.19287109375, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.22658175230026245, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 147.0625, "completions/mean_terminated_length": 147.0625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.137678952177886, "grad_norm": 2.479822874069214, "kl": 3.91796875, "learning_rate": 1.9028599967698533e-05, "loss": 0.0472, "num_tokens": 20635325.0, "reward": -1.515625, "reward_std": 1.1484148502349854, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -6.171875, "rewards/ppl_reward/std": 4.930459022521973, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.22699186205863953, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.13889734998477, "grad_norm": 4.066699981689453, "kl": 5.7421875, "learning_rate": 1.9024936317580086e-05, "loss": 0.1107, "num_tokens": 20654333.0, "reward": -3.41082763671875, "reward_std": 2.1108975410461426, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -9.9075927734375, "rewards/ppl_reward/std": 6.773366451263428, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.22709426283836365, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 146.609375, "completions/mean_terminated_length": 146.609375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.140115747791654, "grad_norm": 3.2378945350646973, "kl": 2.923828125, "learning_rate": 1.902126612575397e-05, "loss": 0.013, "num_tokens": 20670772.0, "reward": -0.7498779296875, "reward_std": 1.258345603942871, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -4.741943359375, "rewards/ppl_reward/std": 3.340709924697876, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.22682106494903564, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 148.921875, "completions/mean_terminated_length": 148.921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.141334145598538, "grad_norm": 3.526782512664795, "kl": 4.19140625, "learning_rate": 1.9017589394880515e-05, "loss": 0.0996, "num_tokens": 20687823.0, "reward": -1.5067138671875, "reward_std": 1.7129461765289307, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.286865234375, "rewards/ppl_reward/std": 6.548304080963135, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.22240428626537323, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 139.640625, "completions/mean_terminated_length": 139.640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.1425525434054218, "grad_norm": 2.610985279083252, "kl": 3.5625, "learning_rate": 1.9013906127624793e-05, "loss": 0.0244, "num_tokens": 20703272.0, "reward": -2.77099609375, "reward_std": 1.9838218688964844, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -8.6748046875, "rewards/ppl_reward/std": 5.441060543060303, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.24772652983665466, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 152.65625, "completions/mean_terminated_length": 152.65625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.1437709412123058, "grad_norm": 3.351353168487549, "kl": 2.9375, "learning_rate": 1.901021632665661e-05, "loss": -0.0068, "num_tokens": 20720210.0, "reward": -0.2509765625, "reward_std": 0.6327069997787476, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -3.869140625, "rewards/ppl_reward/std": 0.7784399390220642, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18926911056041718, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.1449893390191899, "grad_norm": 2.571305751800537, "kl": 2.80859375, "learning_rate": 1.9006519994650516e-05, "loss": -0.0193, "num_tokens": 20736274.0, "reward": -1.859619140625, "reward_std": 1.6584250926971436, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.07080078125, "rewards/ppl_reward/std": 5.855388164520264, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23906518518924713, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 187.828125, "completions/mean_terminated_length": 187.828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.1462077368260737, "grad_norm": 1.8235045671463013, "kl": 1.6025390625, "learning_rate": 1.900281713428578e-05, "loss": -0.0047, "num_tokens": 20756079.0, "reward": -0.6192626953125, "reward_std": 0.4704533517360687, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.074462890625, "rewards/ppl_reward/std": 1.432273507118225, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 164.359375, "completions/mean_terminated_length": 164.359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.1474261346329577, "grad_norm": 1.5679899454116821, "kl": 1.15234375, "learning_rate": 1.8999107748246427e-05, "loss": 0.0148, "num_tokens": 20773414.0, "reward": -1.4168701171875, "reward_std": 0.46353065967559814, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.685302734375, "rewards/ppl_reward/std": 5.268205642700195, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 171.453125, "completions/mean_terminated_length": 171.453125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.1486445324398415, "grad_norm": 1.778481364250183, "kl": 2.58984375, "learning_rate": 1.899539183922119e-05, "loss": -0.0442, "num_tokens": 20791051.0, "reward": -4.7998046875, "reward_std": 4.40079402923584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -13.208984375, "rewards/ppl_reward/std": 21.374771118164062, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2291666716337204, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 155.21875, "completions/mean_terminated_length": 155.21875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.1498629302467256, "grad_norm": 1.622607946395874, "kl": 2.82421875, "learning_rate": 1.8991669409903538e-05, "loss": -0.0175, "num_tokens": 20807721.0, "reward": -0.8330078125, "reward_std": 0.8926883339881897, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.244140625, "rewards/ppl_reward/std": 2.347991466522217, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.25283610820770264, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 150.71875, "completions/mean_terminated_length": 150.71875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.1510813280536094, "grad_norm": 3.4889252185821533, "kl": 6.244140625, "learning_rate": 1.8987940462991673e-05, "loss": 0.1695, "num_tokens": 20824887.0, "reward": -4.115234375, "reward_std": 2.2646613121032715, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -11.43359375, "rewards/ppl_reward/std": 11.744951248168945, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.3604140877723694, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 144.15625, "completions/mean_terminated_length": 144.15625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.1522997258604935, "grad_norm": 3.600647211074829, "kl": 7.296875, "learning_rate": 1.8984205001188506e-05, "loss": 0.0847, "num_tokens": 20841833.0, "reward": -2.02752685546875, "reward_std": 2.389115810394287, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.9847412109375, "rewards/ppl_reward/std": 8.646812438964844, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.4237310290336609, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 140.046875, "completions/mean_terminated_length": 140.046875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.1535181236673775, "grad_norm": 3.9318106174468994, "kl": 6.6875, "learning_rate": 1.8980463027201685e-05, "loss": 0.0886, "num_tokens": 20858268.0, "reward": -1.9833984375, "reward_std": 1.8883156776428223, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -7.044921875, "rewards/ppl_reward/std": 4.006453514099121, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.3712611496448517, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 148.53125, "completions/mean_terminated_length": 148.53125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.1547365214742613, "grad_norm": 4.910560607910156, "kl": 8.3125, "learning_rate": 1.8976714543743574e-05, "loss": 0.2741, "num_tokens": 20874966.0, "reward": -0.864501953125, "reward_std": 1.1787464618682861, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -4.90869140625, "rewards/ppl_reward/std": 2.3845462799072266, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.35823556780815125, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.1559549192811454, "grad_norm": 3.1406662464141846, "kl": 6.71875, "learning_rate": 1.897295955353125e-05, "loss": 0.1306, "num_tokens": 20890962.0, "reward": -1.3685302734375, "reward_std": 1.4186229705810547, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -5.737060546875, "rewards/ppl_reward/std": 2.124824285507202, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.38188132643699646, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 135.46875, "completions/mean_terminated_length": 135.46875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.1571733170880292, "grad_norm": 2.301722764968872, "kl": 3.484375, "learning_rate": 1.8969198059286515e-05, "loss": 0.0999, "num_tokens": 20906464.0, "reward": -0.83642578125, "reward_std": 1.0317174196243286, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.1494140625, "rewards/ppl_reward/std": 2.7586541175842285, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.224347323179245, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.1583917148949132, "grad_norm": 3.2180886268615723, "kl": 4.8125, "learning_rate": 1.8965430063735873e-05, "loss": 0.3018, "num_tokens": 20921376.0, "reward": -0.9661865234375, "reward_std": 0.6598356366157532, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.393310546875, "rewards/ppl_reward/std": 2.1742594242095947, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2043897658586502, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 122.328125, "completions/mean_terminated_length": 122.328125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.159610112701797, "grad_norm": 2.5414557456970215, "kl": 5.515625, "learning_rate": 1.8961655569610557e-05, "loss": 0.192, "num_tokens": 20935813.0, "reward": -2.60009765625, "reward_std": 3.288857936859131, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.5048828125, "rewards/ppl_reward/std": 9.227522850036621, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.27433067560195923, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 129.71875, "completions/mean_terminated_length": 129.71875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.160828510508681, "grad_norm": 1.8564426898956299, "kl": 4.4208984375, "learning_rate": 1.89578745796465e-05, "loss": 0.1843, "num_tokens": 20950483.0, "reward": -1.5653076171875, "reward_std": 0.9190698862075806, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.661865234375, "rewards/ppl_reward/std": 5.174370765686035, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.1620469083155651, "grad_norm": 2.697749614715576, "kl": 1.8896484375, "learning_rate": 1.8954087096584338e-05, "loss": 0.0489, "num_tokens": 20966851.0, "reward": -2.756591796875, "reward_std": 1.0276288986206055, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.31005859375, "rewards/ppl_reward/std": 6.944423675537109, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 119.125, "completions/mean_terminated_length": 119.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.163265306122449, "grad_norm": 1.764503836631775, "kl": 3.8515625, "learning_rate": 1.8950293123169427e-05, "loss": 0.1019, "num_tokens": 20981091.0, "reward": -1.5701904296875, "reward_std": 1.9156696796417236, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.757568359375, "rewards/ppl_reward/std": 6.381625652313232, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.164483703929333, "grad_norm": 2.2134621143341064, "kl": 4.6875, "learning_rate": 1.8946492662151822e-05, "loss": 0.1762, "num_tokens": 20996427.0, "reward": -2.387451171875, "reward_std": 0.8098407983779907, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.39990234375, "rewards/ppl_reward/std": 6.351088523864746, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 144.234375, "completions/mean_terminated_length": 144.234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.1657021017362168, "grad_norm": 2.4227895736694336, "kl": 4.046875, "learning_rate": 1.894268571628628e-05, "loss": 0.0696, "num_tokens": 21014546.0, "reward": -0.802734375, "reward_std": 1.4608702659606934, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.18359375, "rewards/ppl_reward/std": 3.810408592224121, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.244862899184227, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 133.078125, "completions/mean_terminated_length": 133.078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.1669204995431008, "grad_norm": 7.107782363891602, "kl": 3.896484375, "learning_rate": 1.8938872288332254e-05, "loss": 0.1255, "num_tokens": 21030407.0, "reward": -1.2098388671875, "reward_std": 0.8895694017410278, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.044677734375, "rewards/ppl_reward/std": 2.820680856704712, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 130.53125, "completions/mean_terminated_length": 130.53125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.1681388973499849, "grad_norm": 8.056589126586914, "kl": 5.60546875, "learning_rate": 1.8935052381053902e-05, "loss": 0.1875, "num_tokens": 21045377.0, "reward": -2.0487060546875, "reward_std": 1.1906518936157227, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.753662109375, "rewards/ppl_reward/std": 5.944178104400635, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 135.109375, "completions/mean_terminated_length": 135.109375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.1693572951568687, "grad_norm": 4.794267177581787, "kl": 7.9375, "learning_rate": 1.893122599722008e-05, "loss": 0.2725, "num_tokens": 21060888.0, "reward": -1.173583984375, "reward_std": 1.1406996250152588, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.86279296875, "rewards/ppl_reward/std": 2.9075145721435547, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 129.609375, "completions/mean_terminated_length": 129.609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.1705756929637527, "grad_norm": 5.818626403808594, "kl": 6.54296875, "learning_rate": 1.8927393139604327e-05, "loss": 0.1714, "num_tokens": 21076167.0, "reward": -2.6541748046875, "reward_std": 1.3070392608642578, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.722412109375, "rewards/ppl_reward/std": 6.166228771209717, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23061636090278625, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 126.5625, "completions/mean_terminated_length": 126.5625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.1717940907706366, "grad_norm": 5.637094497680664, "kl": 7.28125, "learning_rate": 1.8923553810984893e-05, "loss": 0.2111, "num_tokens": 21091083.0, "reward": -2.6776123046875, "reward_std": 1.7529053688049316, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.761474609375, "rewards/ppl_reward/std": 8.893563270568848, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19158901274204254, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 132.171875, "completions/mean_terminated_length": 132.171875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.1730124885775206, "grad_norm": 5.453283786773682, "kl": 5.916015625, "learning_rate": 1.8919708014144698e-05, "loss": 0.2629, "num_tokens": 21106094.0, "reward": -1.4454345703125, "reward_std": 0.946445643901825, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.453369140625, "rewards/ppl_reward/std": 5.2121381759643555, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 136.03125, "completions/mean_terminated_length": 136.03125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.1742308863844044, "grad_norm": 2.089508295059204, "kl": 4.623046875, "learning_rate": 1.8915855751871364e-05, "loss": 0.0673, "num_tokens": 21121968.0, "reward": -1.79296875, "reward_std": 0.8507305979728699, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.125, "rewards/ppl_reward/std": 3.3009908199310303, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18191926181316376, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 132.203125, "completions/mean_terminated_length": 132.203125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.1754492841912885, "grad_norm": 2.635544538497925, "kl": 5.50390625, "learning_rate": 1.89119970269572e-05, "loss": 0.175, "num_tokens": 21137173.0, "reward": -5.052001953125, "reward_std": 4.324289321899414, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -13.53369140625, "rewards/ppl_reward/std": 30.383825302124023, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20710203051567078, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 123.46875, "completions/mean_terminated_length": 123.46875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.1766676819981723, "grad_norm": 2.1356234550476074, "kl": 2.6953125, "learning_rate": 1.8908131842199192e-05, "loss": -0.0197, "num_tokens": 21151683.0, "reward": -1.0413818359375, "reward_std": 1.1275355815887451, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.801513671875, "rewards/ppl_reward/std": 3.499375343322754, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 140.921875, "completions/mean_terminated_length": 140.921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.1778860798050563, "grad_norm": 2.9401090145111084, "kl": 4.037109375, "learning_rate": 1.890426020039901e-05, "loss": 0.0945, "num_tokens": 21168030.0, "reward": -0.918701171875, "reward_std": 0.8013306260108948, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.37646484375, "rewards/ppl_reward/std": 3.2445805072784424, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18191926181316376, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 136.890625, "completions/mean_terminated_length": 136.890625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.1791044776119404, "grad_norm": 2.817436933517456, "kl": 4.58984375, "learning_rate": 1.8900382104363008e-05, "loss": 0.1527, "num_tokens": 21183983.0, "reward": -5.08740234375, "reward_std": 1.2542994022369385, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -13.7060546875, "rewards/ppl_reward/std": 23.366146087646484, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 134.203125, "completions/mean_terminated_length": 134.203125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.1803228754188242, "grad_norm": 2.9540252685546875, "kl": 3.6884765625, "learning_rate": 1.889649755690221e-05, "loss": 0.1125, "num_tokens": 21200076.0, "reward": -0.45068359375, "reward_std": 0.7577577829360962, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.5263671875, "rewards/ppl_reward/std": 1.861506700515747, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 154.046875, "completions/mean_terminated_length": 154.046875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.1815412732257082, "grad_norm": 2.2323691844940186, "kl": 3.4375, "learning_rate": 1.8892606560832335e-05, "loss": 0.193, "num_tokens": 21219247.0, "reward": -1.2811279296875, "reward_std": 0.43786728382110596, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.382568359375, "rewards/ppl_reward/std": 3.7513792514801025, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 132.484375, "completions/mean_terminated_length": 132.484375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.182759671032592, "grad_norm": 2.537565231323242, "kl": 3.345703125, "learning_rate": 1.888870911897376e-05, "loss": 0.0871, "num_tokens": 21234598.0, "reward": -1.1024169921875, "reward_std": 0.7994456887245178, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.931396484375, "rewards/ppl_reward/std": 3.2739334106445312, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 140.8125, "completions/mean_terminated_length": 140.8125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.183978068839476, "grad_norm": 2.0177600383758545, "kl": 2.04296875, "learning_rate": 1.888480523415153e-05, "loss": 0.0801, "num_tokens": 21250602.0, "reward": -1.251220703125, "reward_std": 0.6200394630432129, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.34619140625, "rewards/ppl_reward/std": 2.679507255554199, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 111.296875, "completions/mean_terminated_length": 111.296875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.1851964666463601, "grad_norm": 2.4568684101104736, "kl": 4.791015625, "learning_rate": 1.888089490919538e-05, "loss": 0.0951, "num_tokens": 21264141.0, "reward": -4.8751220703125, "reward_std": 3.771918773651123, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -13.367431640625, "rewards/ppl_reward/std": 28.100189208984375, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 123.390625, "completions/mean_terminated_length": 123.390625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.186414864453244, "grad_norm": 2.3942883014678955, "kl": 4.201171875, "learning_rate": 1.887697814693969e-05, "loss": 0.0506, "num_tokens": 21278894.0, "reward": -2.4180908203125, "reward_std": 2.9347469806671143, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.359619140625, "rewards/ppl_reward/std": 15.770853996276855, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2043897658586502, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 119.359375, "completions/mean_terminated_length": 119.359375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.187633262260128, "grad_norm": 2.732501268386841, "kl": 2.97265625, "learning_rate": 1.8873054950223527e-05, "loss": 0.1072, "num_tokens": 21293325.0, "reward": -2.53497314453125, "reward_std": 0.6526333093643188, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.8668212890625, "rewards/ppl_reward/std": 6.343470573425293, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 130.46875, "completions/mean_terminated_length": 130.46875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.1888516600670118, "grad_norm": 2.2749857902526855, "kl": 3.826171875, "learning_rate": 1.886912532189061e-05, "loss": 0.0893, "num_tokens": 21308851.0, "reward": -1.05218505859375, "reward_std": 0.7536875009536743, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.8778076171875, "rewards/ppl_reward/std": 2.9413459300994873, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 118.734375, "completions/mean_terminated_length": 118.734375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.1900700578738959, "grad_norm": 2.4443113803863525, "kl": 5.353515625, "learning_rate": 1.886518926478932e-05, "loss": 0.2414, "num_tokens": 21322738.0, "reward": -1.873779296875, "reward_std": 0.6135172843933105, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.48974609375, "rewards/ppl_reward/std": 4.7372236251831055, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 117.90625, "completions/mean_terminated_length": 117.90625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.1912884556807797, "grad_norm": 6.643538475036621, "kl": 8.80859375, "learning_rate": 1.88612467817727e-05, "loss": 0.2897, "num_tokens": 21337428.0, "reward": -3.369384765625, "reward_std": 2.0310935974121094, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -10.21533203125, "rewards/ppl_reward/std": 10.802193641662598, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 128.921875, "completions/mean_terminated_length": 128.921875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.1925068534876637, "grad_norm": 2.804372787475586, "kl": 7.27734375, "learning_rate": 1.8857297875698455e-05, "loss": 0.3219, "num_tokens": 21353415.0, "reward": -0.864013671875, "reward_std": 0.76275634765625, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.29052734375, "rewards/ppl_reward/std": 1.6207870244979858, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22658175230026245, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 111.984375, "completions/mean_terminated_length": 111.984375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.1937252512945478, "grad_norm": 5.124625205993652, "kl": 9.0703125, "learning_rate": 1.885334254942894e-05, "loss": 0.275, "num_tokens": 21367910.0, "reward": -0.470458984375, "reward_std": 1.3557475805282593, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -4.37841796875, "rewards/ppl_reward/std": 3.5700502395629883, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 113.390625, "completions/mean_terminated_length": 113.390625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.1949436491014316, "grad_norm": 2.771878957748413, "kl": 6.19140625, "learning_rate": 1.884938080583117e-05, "loss": 0.2577, "num_tokens": 21381895.0, "reward": -2.2906494140625, "reward_std": 0.93719482421875, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.206298828125, "rewards/ppl_reward/std": 7.308290958404541, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 114.640625, "completions/mean_terminated_length": 114.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.1961620469083156, "grad_norm": 3.2768731117248535, "kl": 6.9375, "learning_rate": 1.8845412647776795e-05, "loss": 0.455, "num_tokens": 21396168.0, "reward": -4.53564453125, "reward_std": 1.5627999305725098, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -12.7822265625, "rewards/ppl_reward/std": 15.177066802978516, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 116.09375, "completions/mean_terminated_length": 116.09375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.1973804447151994, "grad_norm": 2.7375051975250244, "kl": 4.66015625, "learning_rate": 1.8841438078142136e-05, "loss": 0.1853, "num_tokens": 21411230.0, "reward": -1.763427734375, "reward_std": 2.0492758750915527, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.11279296875, "rewards/ppl_reward/std": 7.196657180786133, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 108.53125, "completions/mean_terminated_length": 108.53125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.1985988425220835, "grad_norm": 4.03561544418335, "kl": 6.328125, "learning_rate": 1.8837457099808155e-05, "loss": 0.3729, "num_tokens": 21425104.0, "reward": -1.07958984375, "reward_std": 0.7301881313323975, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.7607421875, "rewards/ppl_reward/std": 1.551416277885437, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 95.671875, "completions/mean_terminated_length": 95.671875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.1998172403289673, "grad_norm": 3.1127729415893555, "kl": 5.0830078125, "learning_rate": 1.883346971566045e-05, "loss": 0.2208, "num_tokens": 21437875.0, "reward": -1.0435791015625, "reward_std": 0.956811249256134, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.743408203125, "rewards/ppl_reward/std": 4.581871509552002, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 123.328125, "completions/mean_terminated_length": 123.328125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.2010356381358513, "grad_norm": 2.4096875190734863, "kl": 6.1328125, "learning_rate": 1.8829475928589272e-05, "loss": 0.342, "num_tokens": 21454400.0, "reward": -1.6123046875, "reward_std": 1.12855863571167, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.529296875, "rewards/ppl_reward/std": 3.9587323665618896, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.27455660700798035, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 101.21875, "completions/mean_terminated_length": 101.21875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.2022540359427354, "grad_norm": 2.356342077255249, "kl": 3.060546875, "learning_rate": 1.8825475741489504e-05, "loss": 0.078, "num_tokens": 21468110.0, "reward": -2.465087890625, "reward_std": 1.3252627849578857, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.65673828125, "rewards/ppl_reward/std": 9.172990798950195, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 109.78125, "completions/mean_terminated_length": 109.78125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.2034724337496192, "grad_norm": 3.442685842514038, "kl": 4.53125, "learning_rate": 1.8821469157260687e-05, "loss": 0.2634, "num_tokens": 21482408.0, "reward": -0.5423583984375, "reward_std": 0.8055561780929565, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.772216796875, "rewards/ppl_reward/std": 2.381981372833252, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 99.203125, "completions/mean_terminated_length": 99.203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.2046908315565032, "grad_norm": 2.557626724243164, "kl": 6.39453125, "learning_rate": 1.8817456178806968e-05, "loss": 0.2412, "num_tokens": 21495741.0, "reward": -1.29541015625, "reward_std": 1.0520939826965332, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.1064453125, "rewards/ppl_reward/std": 4.808581829071045, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 100.984375, "completions/mean_terminated_length": 100.984375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.205909229363387, "grad_norm": 2.2109293937683105, "kl": 6.80859375, "learning_rate": 1.8813436809037164e-05, "loss": 0.3774, "num_tokens": 21509028.0, "reward": -0.908203125, "reward_std": 1.1516783237457275, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.29296875, "rewards/ppl_reward/std": 4.662153720855713, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 100.640625, "completions/mean_terminated_length": 100.640625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.2071276271702711, "grad_norm": 3.825432300567627, "kl": 9.45703125, "learning_rate": 1.8809411050864694e-05, "loss": 0.521, "num_tokens": 21522725.0, "reward": -0.912109375, "reward_std": 1.4919917583465576, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.28515625, "rewards/ppl_reward/std": 3.098011016845703, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.26954248547554016, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 86.21875, "completions/mean_terminated_length": 86.21875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.2083460249771552, "grad_norm": 1.711021900177002, "kl": 3.5048828125, "learning_rate": 1.880537890720763e-05, "loss": 0.1264, "num_tokens": 21534779.0, "reward": -1.710205078125, "reward_std": 1.748624324798584, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.06884765625, "rewards/ppl_reward/std": 7.883073806762695, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 99.703125, "completions/mean_terminated_length": 99.703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.209564422784039, "grad_norm": 1.800609827041626, "kl": 3.978515625, "learning_rate": 1.880134038098866e-05, "loss": 0.2229, "num_tokens": 21548688.0, "reward": -1.21356201171875, "reward_std": 0.8035619258880615, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.1927490234375, "rewards/ppl_reward/std": 4.72216272354126, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 96.84375, "completions/mean_terminated_length": 96.84375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.210782820590923, "grad_norm": 2.67280912399292, "kl": 8.0390625, "learning_rate": 1.87972954751351e-05, "loss": 0.4179, "num_tokens": 21561454.0, "reward": -0.5205078125, "reward_std": 1.196666955947876, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.642578125, "rewards/ppl_reward/std": 3.0989084243774414, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 90.578125, "completions/mean_terminated_length": 90.578125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.2120012183978068, "grad_norm": 4.04115104675293, "kl": 6.712890625, "learning_rate": 1.8793244192578895e-05, "loss": 0.2471, "num_tokens": 21573659.0, "reward": -1.5191650390625, "reward_std": 0.7409985065460205, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.718017578125, "rewards/ppl_reward/std": 4.291399955749512, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 102.875, "completions/mean_terminated_length": 102.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.2132196162046909, "grad_norm": 5.486792087554932, "kl": 7.41796875, "learning_rate": 1.878918653625661e-05, "loss": 0.2875, "num_tokens": 21588043.0, "reward": -8.4957275390625, "reward_std": 9.033602714538574, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -20.624267578125, "rewards/ppl_reward/std": 57.30815505981445, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23007801175117493, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 103.546875, "completions/mean_terminated_length": 103.546875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.2144380140115747, "grad_norm": 7.10839319229126, "kl": 10.0234375, "learning_rate": 1.8785122509109425e-05, "loss": 0.5947, "num_tokens": 21601294.0, "reward": -0.4400634765625, "reward_std": 0.5235269069671631, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.598876953125, "rewards/ppl_reward/std": 2.4269638061523438, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 87.953125, "completions/mean_terminated_length": 87.953125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.2156564118184587, "grad_norm": 2.238770008087158, "kl": 4.3369140625, "learning_rate": 1.878105211408315e-05, "loss": 0.2067, "num_tokens": 21613323.0, "reward": -2.7828369140625, "reward_std": 0.8715656995773315, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.300048828125, "rewards/ppl_reward/std": 5.679279327392578, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 95.890625, "completions/mean_terminated_length": 95.890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.2168748096253426, "grad_norm": 2.577430009841919, "kl": 5.3203125, "learning_rate": 1.8776975354128193e-05, "loss": 0.2966, "num_tokens": 21626236.0, "reward": -1.13330078125, "reward_std": 1.3547126054763794, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.9384765625, "rewards/ppl_reward/std": 5.78505277633667, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 104.65625, "completions/mean_terminated_length": 104.65625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.2180932074322266, "grad_norm": 4.241823673248291, "kl": 7.63671875, "learning_rate": 1.8772892232199594e-05, "loss": 0.3317, "num_tokens": 21639782.0, "reward": -2.800048828125, "reward_std": 0.9698259830474854, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.16259765625, "rewards/ppl_reward/std": 7.060941219329834, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21764887869358063, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 117.265625, "completions/mean_terminated_length": 117.265625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.2193116052391106, "grad_norm": 3.041132688522339, "kl": 7.783203125, "learning_rate": 1.876880275125699e-05, "loss": 0.4066, "num_tokens": 21655095.0, "reward": -1.104248046875, "reward_std": 0.9275705814361572, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.84130859375, "rewards/ppl_reward/std": 3.228414535522461, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 99.984375, "completions/mean_terminated_length": 99.984375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.2205300030459945, "grad_norm": 2.296255350112915, "kl": 6.140625, "learning_rate": 1.8764706914264636e-05, "loss": 0.2501, "num_tokens": 21667998.0, "reward": -0.6842041015625, "reward_std": 0.4461081624031067, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.087158203125, "rewards/ppl_reward/std": 1.830468773841858, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 94.4375, "completions/mean_terminated_length": 94.4375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.2217484008528785, "grad_norm": 2.432368516921997, "kl": 3.8955078125, "learning_rate": 1.8760604724191397e-05, "loss": 0.0847, "num_tokens": 21680522.0, "reward": -0.3455810546875, "reward_std": 0.5339393615722656, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.441162109375, "rewards/ppl_reward/std": 2.2107629776000977, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 107.09375, "completions/mean_terminated_length": 107.09375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.2229667986597623, "grad_norm": 3.031420946121216, "kl": 5.046875, "learning_rate": 1.875649618401073e-05, "loss": 0.2881, "num_tokens": 21694304.0, "reward": -4.861328125, "reward_std": 0.7790601253509521, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -13.37109375, "rewards/ppl_reward/std": 10.784232139587402, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.2241851964666464, "grad_norm": 2.2278096675872803, "kl": 5.783203125, "learning_rate": 1.8752381296700705e-05, "loss": 0.2465, "num_tokens": 21709664.0, "reward": -1.510009765625, "reward_std": 1.8448063135147095, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.64501953125, "rewards/ppl_reward/std": 6.6938323974609375, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19416078925132751, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.2254035942735304, "grad_norm": 2.801065444946289, "kl": 4.509765625, "learning_rate": 1.8748260065243985e-05, "loss": 0.2153, "num_tokens": 21723840.0, "reward": -2.561279296875, "reward_std": 1.0388646125793457, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.72412109375, "rewards/ppl_reward/std": 4.413554668426514, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 116.8125, "completions/mean_terminated_length": 116.8125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.2266219920804142, "grad_norm": 2.3865151405334473, "kl": 4.80859375, "learning_rate": 1.8744132492627843e-05, "loss": 0.1964, "num_tokens": 21738028.0, "reward": -1.218994140625, "reward_std": 0.5354308485984802, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.07861328125, "rewards/ppl_reward/std": 2.758913993835449, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 119.28125, "completions/mean_terminated_length": 119.28125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.2278403898872983, "grad_norm": 2.398960828781128, "kl": 8.296875, "learning_rate": 1.8739998581844145e-05, "loss": 0.389, "num_tokens": 21752478.0, "reward": -1.2413330078125, "reward_std": 1.3680424690246582, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.787353515625, "rewards/ppl_reward/std": 2.9045276641845703, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22980836033821106, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 134.546875, "completions/mean_terminated_length": 134.546875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.229058787694182, "grad_norm": 3.020543098449707, "kl": 8.640625, "learning_rate": 1.8735858335889343e-05, "loss": 0.3961, "num_tokens": 21769545.0, "reward": -1.2606201171875, "reward_std": 0.9568136930465698, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.943115234375, "rewards/ppl_reward/std": 3.879594087600708, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21921011805534363, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 123.40625, "completions/mean_terminated_length": 123.40625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.2302771855010661, "grad_norm": 2.6622540950775146, "kl": 5.16015625, "learning_rate": 1.8731711757764483e-05, "loss": 0.1433, "num_tokens": 21784323.0, "reward": -0.74951171875, "reward_std": 0.9716695547103882, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.0615234375, "rewards/ppl_reward/std": 3.5086488723754883, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.116794154047966, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 110.40625, "completions/mean_terminated_length": 110.40625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.2314955833079502, "grad_norm": 3.4743080139160156, "kl": 4.212890625, "learning_rate": 1.8727558850475213e-05, "loss": 0.0354, "num_tokens": 21798213.0, "reward": -3.8779296875, "reward_std": 1.6148498058319092, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.248046875, "rewards/ppl_reward/std": 8.401679992675781, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2229611724615097, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 121.1875, "completions/mean_terminated_length": 121.1875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.232713981114834, "grad_norm": 1.7973357439041138, "kl": 4.337890625, "learning_rate": 1.8723399617031754e-05, "loss": 0.1986, "num_tokens": 21812481.0, "reward": -1.3251953125, "reward_std": 0.9926040768623352, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.306640625, "rewards/ppl_reward/std": 4.244632720947266, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 128.03125, "completions/mean_terminated_length": 128.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.233932378921718, "grad_norm": 3.0071463584899902, "kl": 3.0556640625, "learning_rate": 1.8719234060448914e-05, "loss": 0.1446, "num_tokens": 21827763.0, "reward": -0.8985595703125, "reward_std": 0.5673414468765259, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.523681640625, "rewards/ppl_reward/std": 3.2113046646118164, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.2351507767286019, "grad_norm": 2.0597457885742188, "kl": 5.013671875, "learning_rate": 1.87150621837461e-05, "loss": 0.223, "num_tokens": 21842363.0, "reward": -3.259521484375, "reward_std": 1.0679181814193726, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.18310546875, "rewards/ppl_reward/std": 7.381554126739502, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.236369174535486, "grad_norm": 20.163881301879883, "kl": 18.046875, "learning_rate": 1.8710883989947278e-05, "loss": 0.6799, "num_tokens": 21856819.0, "reward": -3.231689453125, "reward_std": 1.4328473806381226, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -8.83837890625, "rewards/ppl_reward/std": 4.589975833892822, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 133.546875, "completions/mean_terminated_length": 133.546875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.2375875723423697, "grad_norm": 22.353504180908203, "kl": 21.75, "learning_rate": 1.8706699482081015e-05, "loss": 0.8497, "num_tokens": 21872414.0, "reward": -1.3887939453125, "reward_std": 1.120243787765503, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "rewards/ppl_reward/mean": -5.105712890625, "rewards/ppl_reward/std": 2.5782322883605957, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.2387082874774933, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 117.6875, "completions/mean_terminated_length": 117.6875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.2388059701492538, "grad_norm": 9.776307106018066, "kl": 11.1328125, "learning_rate": 1.8702508663180432e-05, "loss": 0.4122, "num_tokens": 21886850.0, "reward": -1.937255859375, "reward_std": 1.4703201055526733, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49501484632492065, "rewards/ppl_reward/mean": -6.68701171875, "rewards/ppl_reward/std": 3.027069330215454, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.29546841979026794, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 120.171875, "completions/mean_terminated_length": 120.171875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.2400243679561376, "grad_norm": 2.145014524459839, "kl": 4.724609375, "learning_rate": 1.8698311536283244e-05, "loss": 0.1784, "num_tokens": 21900941.0, "reward": -0.79119873046875, "reward_std": 0.7132036685943604, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.2308349609375, "rewards/ppl_reward/std": 2.8373448848724365, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 125.203125, "completions/mean_terminated_length": 125.203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.2412427657630216, "grad_norm": 2.900542974472046, "kl": 2.14453125, "learning_rate": 1.8694108104431725e-05, "loss": 0.0306, "num_tokens": 21915954.0, "reward": -1.2493896484375, "reward_std": 0.7321663498878479, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.240966796875, "rewards/ppl_reward/std": 2.230264902114868, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 129.90625, "completions/mean_terminated_length": 129.90625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.2424611635699057, "grad_norm": 3.0032944679260254, "kl": 1.822265625, "learning_rate": 1.8689898370672727e-05, "loss": 0.0123, "num_tokens": 21931396.0, "reward": -0.63690185546875, "reward_std": 0.43483754992485046, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.1566162109375, "rewards/ppl_reward/std": 2.7387185096740723, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 140.828125, "completions/mean_terminated_length": 140.828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.2436795613767895, "grad_norm": 1.850996494293213, "kl": 1.876953125, "learning_rate": 1.8685682338057667e-05, "loss": 0.0965, "num_tokens": 21947625.0, "reward": -1.191162109375, "reward_std": 0.7681655883789062, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.28076171875, "rewards/ppl_reward/std": 4.812075138092041, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 136.109375, "completions/mean_terminated_length": 136.109375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.2448979591836735, "grad_norm": 1.8894782066345215, "kl": 1.427734375, "learning_rate": 1.8681460009642533e-05, "loss": -0.0351, "num_tokens": 21964112.0, "reward": -1.47076416015625, "reward_std": 0.23350894451141357, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.8868408203125, "rewards/ppl_reward/std": 5.303915500640869, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 136.140625, "completions/mean_terminated_length": 136.140625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.2461163569905573, "grad_norm": 2.092665672302246, "kl": 1.84375, "learning_rate": 1.867723138848786e-05, "loss": 0.065, "num_tokens": 21979793.0, "reward": -1.8251953125, "reward_std": 0.8867665529251099, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.486328125, "rewards/ppl_reward/std": 6.759219646453857, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.16587424278259277, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 128.59375, "completions/mean_terminated_length": 128.59375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.2473347547974414, "grad_norm": 1.8694770336151123, "kl": 2.2373046875, "learning_rate": 1.8672996477658767e-05, "loss": 0.0349, "num_tokens": 21995439.0, "reward": -1.954345703125, "reward_std": 0.6368165016174316, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.56494140625, "rewards/ppl_reward/std": 7.897489547729492, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 123.921875, "completions/mean_terminated_length": 123.921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.2485531526043254, "grad_norm": 2.3878262042999268, "kl": 3.09375, "learning_rate": 1.8668755280224916e-05, "loss": -0.0048, "num_tokens": 22009882.0, "reward": -1.416015625, "reward_std": 1.230499505996704, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.35546875, "rewards/ppl_reward/std": 3.548065185546875, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2229611724615097, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.2497715504112092, "grad_norm": 1.5008357763290405, "kl": 3.6328125, "learning_rate": 1.8664507799260526e-05, "loss": 0.1271, "num_tokens": 22026582.0, "reward": -0.64044189453125, "reward_std": 0.5243905782699585, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.8590087890625, "rewards/ppl_reward/std": 3.0700924396514893, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.2509899482180933, "grad_norm": 2.1735596656799316, "kl": 4.6474609375, "learning_rate": 1.866025403784439e-05, "loss": 0.157, "num_tokens": 22043622.0, "reward": -0.1817626953125, "reward_std": 0.7185294032096863, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -3.918212890625, "rewards/ppl_reward/std": 1.2616727352142334, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.248226597905159, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 124.53125, "completions/mean_terminated_length": 124.53125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.252208346024977, "grad_norm": 1.8907681703567505, "kl": 2.7705078125, "learning_rate": 1.8655993999059823e-05, "loss": 0.0373, "num_tokens": 22058344.0, "reward": -3.29071044921875, "reward_std": 3.803779363632202, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.2767333984375, "rewards/ppl_reward/std": 21.38338851928711, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.2534267438318611, "grad_norm": 2.7690000534057617, "kl": 4.25390625, "learning_rate": 1.8651727685994715e-05, "loss": 0.1111, "num_tokens": 22074072.0, "reward": -0.7822265625, "reward_std": 1.1093393564224243, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.134765625, "rewards/ppl_reward/std": 2.4151532649993896, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2121305763721466, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 144.359375, "completions/mean_terminated_length": 144.359375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.2546451416387452, "grad_norm": 6.728281021118164, "kl": 11.234375, "learning_rate": 1.8647455101741492e-05, "loss": 0.465, "num_tokens": 22090367.0, "reward": -2.797119140625, "reward_std": 1.638880729675293, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -8.59423828125, "rewards/ppl_reward/std": 4.1153669357299805, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.25, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 140.453125, "completions/mean_terminated_length": 140.453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.255863539445629, "grad_norm": 20.353391647338867, "kl": 13.75, "learning_rate": 1.8643176249397127e-05, "loss": 0.442, "num_tokens": 22107252.0, "reward": -3.5758056640625, "reward_std": 1.1468690633773804, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -9.596923828125, "rewards/ppl_reward/std": 12.701628684997559, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.26676779985427856, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 129.171875, "completions/mean_terminated_length": 129.171875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.2570819372525128, "grad_norm": 20.737388610839844, "kl": 11.234375, "learning_rate": 1.863889113206314e-05, "loss": 0.3863, "num_tokens": 22122311.0, "reward": -1.5029296875, "reward_std": 0.7958639860153198, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.5037065148353577, "rewards/ppl_reward/mean": -5.630859375, "rewards/ppl_reward/std": 2.6482057571411133, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.21764887869358063, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 129.4375, "completions/mean_terminated_length": 129.4375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.2583003350593969, "grad_norm": 5.57792854309082, "kl": 9.1015625, "learning_rate": 1.8634599752845594e-05, "loss": 0.2871, "num_tokens": 22138027.0, "reward": -1.6619873046875, "reward_std": 1.2380576133728027, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -6.222412109375, "rewards/ppl_reward/std": 3.73533296585083, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.2875722348690033, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 130.390625, "completions/mean_terminated_length": 130.390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.259518732866281, "grad_norm": 1.9586679935455322, "kl": 5.1875, "learning_rate": 1.8630302114855078e-05, "loss": 0.1043, "num_tokens": 22153276.0, "reward": -1.6480712890625, "reward_std": 1.36411714553833, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.514892578125, "rewards/ppl_reward/std": 3.2644808292388916, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2221602201461792, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 136.390625, "completions/mean_terminated_length": 136.390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.2607371306731647, "grad_norm": 2.140437364578247, "kl": 3.6171875, "learning_rate": 1.8625998221206732e-05, "loss": 0.0543, "num_tokens": 22169333.0, "reward": -1.1341552734375, "reward_std": 0.783930778503418, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.596435546875, "rewards/ppl_reward/std": 2.18814754486084, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 130.484375, "completions/mean_terminated_length": 130.484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.2619555284800488, "grad_norm": 2.288020372390747, "kl": 3.9140625, "learning_rate": 1.8621688075020226e-05, "loss": 0.1607, "num_tokens": 22185164.0, "reward": -2.337158203125, "reward_std": 1.033594012260437, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.19775390625, "rewards/ppl_reward/std": 6.398834705352783, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.2631739262869326, "grad_norm": 2.1340861320495605, "kl": 5.9140625, "learning_rate": 1.8617371679419757e-05, "loss": 0.2079, "num_tokens": 22201124.0, "reward": -2.1256103515625, "reward_std": 0.7836261987686157, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.673095703125, "rewards/ppl_reward/std": 5.609812259674072, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 128.234375, "completions/mean_terminated_length": 128.234375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.2643923240938166, "grad_norm": 1.6273927688598633, "kl": 2.3837890625, "learning_rate": 1.861304903753406e-05, "loss": 0.0533, "num_tokens": 22216899.0, "reward": -0.59844970703125, "reward_std": 0.3428311347961426, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.0250244140625, "rewards/ppl_reward/std": 2.2483978271484375, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.2656107219007007, "grad_norm": 3.8726634979248047, "kl": 3.859375, "learning_rate": 1.860872015249639e-05, "loss": 0.0479, "num_tokens": 22231699.0, "reward": -2.03515625, "reward_std": 0.45041730999946594, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.8046875, "rewards/ppl_reward/std": 5.6345744132995605, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.2668291197075845, "grad_norm": 2.291656255722046, "kl": 2.146484375, "learning_rate": 1.8604385027444535e-05, "loss": 0.11, "num_tokens": 22248663.0, "reward": -0.80291748046875, "reward_std": 0.25831276178359985, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.4573974609375, "rewards/ppl_reward/std": 4.107797622680664, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 142.21875, "completions/mean_terminated_length": 142.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.2680475175144685, "grad_norm": 2.433729648590088, "kl": 2.31640625, "learning_rate": 1.8600043665520803e-05, "loss": 0.1465, "num_tokens": 22265213.0, "reward": -0.9356689453125, "reward_std": 0.48032504320144653, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.551025390625, "rewards/ppl_reward/std": 2.639531135559082, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 129.890625, "completions/mean_terminated_length": 129.890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.2692659153213524, "grad_norm": 2.234642267227173, "kl": 4.818359375, "learning_rate": 1.8595696069872013e-05, "loss": 0.2127, "num_tokens": 22280110.0, "reward": -2.361083984375, "reward_std": 1.2938761711120605, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.18310546875, "rewards/ppl_reward/std": 4.923771381378174, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.2582649290561676, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 143.359375, "completions/mean_terminated_length": 143.359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.2704843131282364, "grad_norm": 1.995465636253357, "kl": 7.19140625, "learning_rate": 1.859134224364952e-05, "loss": 0.3894, "num_tokens": 22296429.0, "reward": -0.947998046875, "reward_std": 1.0281555652618408, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.39599609375, "rewards/ppl_reward/std": 3.578706741333008, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 136.078125, "completions/mean_terminated_length": 136.078125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.2717027109351204, "grad_norm": 3.401883125305176, "kl": 7.029296875, "learning_rate": 1.8586982190009183e-05, "loss": 0.3024, "num_tokens": 22312586.0, "reward": -0.67919921875, "reward_std": 0.6275047063827515, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -4.6396484375, "rewards/ppl_reward/std": 2.208343744277954, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.2729211087420043, "grad_norm": 3.4084200859069824, "kl": 6.66015625, "learning_rate": 1.8582615912111378e-05, "loss": 0.3434, "num_tokens": 22327138.0, "reward": -1.634033203125, "reward_std": 1.1375155448913574, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.71337890625, "rewards/ppl_reward/std": 6.084864616394043, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.224347323179245, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 130.046875, "completions/mean_terminated_length": 130.046875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.274139506548888, "grad_norm": 2.257523536682129, "kl": 6.46484375, "learning_rate": 1.8578243413120996e-05, "loss": 0.263, "num_tokens": 22342181.0, "reward": -1.9832763671875, "reward_std": 1.7189486026763916, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.411865234375, "rewards/ppl_reward/std": 4.05509614944458, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2372427135705948, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 115.390625, "completions/mean_terminated_length": 115.390625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.2753579043557721, "grad_norm": 2.3391196727752686, "kl": 4.40234375, "learning_rate": 1.857386469620743e-05, "loss": 0.2324, "num_tokens": 22356334.0, "reward": -0.15234375, "reward_std": 0.7113230228424072, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -3.9140625, "rewards/ppl_reward/std": 1.7469968795776367, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 114.34375, "completions/mean_terminated_length": 114.34375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.2765763021626562, "grad_norm": 4.505651473999023, "kl": 9.15625, "learning_rate": 1.8569479764544588e-05, "loss": 0.43, "num_tokens": 22370580.0, "reward": -2.3001708984375, "reward_std": 1.158647060394287, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.123779296875, "rewards/ppl_reward/std": 5.860220432281494, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 108.375, "completions/mean_terminated_length": 108.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.27779469996954, "grad_norm": 3.1064789295196533, "kl": 5.7734375, "learning_rate": 1.856508862131088e-05, "loss": 0.2556, "num_tokens": 22384244.0, "reward": -0.8494873046875, "reward_std": 0.7367959022521973, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.167724609375, "rewards/ppl_reward/std": 2.9205985069274902, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 112.703125, "completions/mean_terminated_length": 112.703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.279013097776424, "grad_norm": 5.671927452087402, "kl": 9.9765625, "learning_rate": 1.8560691269689214e-05, "loss": 0.4195, "num_tokens": 22398161.0, "reward": -8.0166015625, "reward_std": 4.258245944976807, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -19.314453125, "rewards/ppl_reward/std": 39.84765625, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19158901274204254, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 102.90625, "completions/mean_terminated_length": 102.90625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.2802314955833078, "grad_norm": 3.1716840267181396, "kl": 7.09765625, "learning_rate": 1.8556287712867006e-05, "loss": 0.2427, "num_tokens": 22411315.0, "reward": -1.074951171875, "reward_std": 1.00384521484375, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.53271484375, "rewards/ppl_reward/std": 2.779773712158203, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2287265807390213, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 97.4375, "completions/mean_terminated_length": 97.4375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.2814498933901919, "grad_norm": 3.028640031814575, "kl": 4.69140625, "learning_rate": 1.8551877954036165e-05, "loss": 0.1987, "num_tokens": 22424295.0, "reward": -3.70556640625, "reward_std": 1.589369773864746, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -11.0283203125, "rewards/ppl_reward/std": 12.439956665039062, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 111.78125, "completions/mean_terminated_length": 111.78125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.282668291197076, "grad_norm": 3.0835697650909424, "kl": 3.568359375, "learning_rate": 1.8547461996393094e-05, "loss": 0.1735, "num_tokens": 22438897.0, "reward": -0.3101806640625, "reward_std": 0.6847276091575623, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.214111328125, "rewards/ppl_reward/std": 1.6335002183914185, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 108.953125, "completions/mean_terminated_length": 108.953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.2838866890039597, "grad_norm": 1.998755693435669, "kl": 4.6953125, "learning_rate": 1.8543039843138688e-05, "loss": 0.1609, "num_tokens": 22453118.0, "reward": -1.5048828125, "reward_std": 0.6043558120727539, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.650390625, "rewards/ppl_reward/std": 4.020586967468262, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 97.75, "completions/mean_terminated_length": 97.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.2851050868108438, "grad_norm": 2.5717220306396484, "kl": 4.703125, "learning_rate": 1.8538611497478343e-05, "loss": 0.1102, "num_tokens": 22466558.0, "reward": -1.172607421875, "reward_std": 1.2883269786834717, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.91552734375, "rewards/ppl_reward/std": 3.7734570503234863, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2759082615375519, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 91.796875, "completions/mean_terminated_length": 91.796875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.2863234846177276, "grad_norm": 1.8206390142440796, "kl": 4.75390625, "learning_rate": 1.8534176962621934e-05, "loss": 0.19, "num_tokens": 22478793.0, "reward": -2.2943115234375, "reward_std": 0.6224631667137146, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.158935546875, "rewards/ppl_reward/std": 4.6103739738464355, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 95.796875, "completions/mean_terminated_length": 95.796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.2875418824246117, "grad_norm": 4.141596794128418, "kl": 6.3046875, "learning_rate": 1.8529736241783825e-05, "loss": 0.3527, "num_tokens": 22491612.0, "reward": -3.2803955078125, "reward_std": 3.646973133087158, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.045166015625, "rewards/ppl_reward/std": 15.785356521606445, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.28770697116851807, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 104.46875, "completions/mean_terminated_length": 104.46875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.2887602802314957, "grad_norm": 2.0916011333465576, "kl": 2.91015625, "learning_rate": 1.8525289338182865e-05, "loss": 0.0655, "num_tokens": 22505394.0, "reward": -1.700439453125, "reward_std": 0.926301121711731, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.11181640625, "rewards/ppl_reward/std": 3.2288923263549805, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 108.4375, "completions/mean_terminated_length": 108.4375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.2899786780383795, "grad_norm": 2.409064531326294, "kl": 3.599609375, "learning_rate": 1.8520836255042382e-05, "loss": 0.1325, "num_tokens": 22519262.0, "reward": -1.304443359375, "reward_std": 0.8771241903305054, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.24951171875, "rewards/ppl_reward/std": 3.8526782989501953, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 116.046875, "completions/mean_terminated_length": 116.046875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.2911970758452636, "grad_norm": 2.8502352237701416, "kl": 4.07421875, "learning_rate": 1.8516376995590185e-05, "loss": 0.1665, "num_tokens": 22534121.0, "reward": -3.0089111328125, "reward_std": 1.7147375345230103, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.635009765625, "rewards/ppl_reward/std": 13.309850692749023, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 121.53125, "completions/mean_terminated_length": 121.53125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.2924154736521474, "grad_norm": 1.7763224840164185, "kl": 3.8916015625, "learning_rate": 1.8511911563058563e-05, "loss": 0.1429, "num_tokens": 22549651.0, "reward": -0.6131591796875, "reward_std": 0.8678678274154663, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.812255859375, "rewards/ppl_reward/std": 2.3996129035949707, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 121.953125, "completions/mean_terminated_length": 121.953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.2936338714590314, "grad_norm": 3.581902503967285, "kl": 5.15234375, "learning_rate": 1.850743996068427e-05, "loss": 0.2308, "num_tokens": 22564840.0, "reward": -0.456787109375, "reward_std": 0.5781767964363098, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.52294921875, "rewards/ppl_reward/std": 1.9485212564468384, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.12962667644023895, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 99.78125, "completions/mean_terminated_length": 99.78125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.2948522692659155, "grad_norm": 4.013078689575195, "kl": 6.578125, "learning_rate": 1.850296219170854e-05, "loss": 0.1746, "num_tokens": 22577706.0, "reward": -4.5482177734375, "reward_std": 4.01469612121582, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -12.471435546875, "rewards/ppl_reward/std": 16.993488311767578, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2630521357059479, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 112.84375, "completions/mean_terminated_length": 112.84375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.2960706670727993, "grad_norm": 2.956878423690796, "kl": 7.171875, "learning_rate": 1.8498478259377083e-05, "loss": 0.2888, "num_tokens": 22591496.0, "reward": -1.4449462890625, "reward_std": 1.0911545753479004, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -6.147705078125, "rewards/ppl_reward/std": 2.7734124660491943, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23061636090278625, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 109.65625, "completions/mean_terminated_length": 109.65625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.297289064879683, "grad_norm": 2.4933934211730957, "kl": 7.5625, "learning_rate": 1.8493988166940057e-05, "loss": 0.2688, "num_tokens": 22604962.0, "reward": -1.335693359375, "reward_std": 1.6292734146118164, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -5.85888671875, "rewards/ppl_reward/std": 3.5931925773620605, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2597312331199646, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.2985074626865671, "grad_norm": 4.202901363372803, "kl": 9.71875, "learning_rate": 1.8489491917652102e-05, "loss": 0.4053, "num_tokens": 22620282.0, "reward": -1.87353515625, "reward_std": 1.2488292455673218, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.0595703125, "rewards/ppl_reward/std": 3.430556297302246, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.25539806485176086, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 120.890625, "completions/mean_terminated_length": 120.890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.2997258604934512, "grad_norm": 3.290055990219116, "kl": 8.7734375, "learning_rate": 1.848498951477232e-05, "loss": 0.3359, "num_tokens": 22635755.0, "reward": -1.77239990234375, "reward_std": 1.6534240245819092, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.7557373046875, "rewards/ppl_reward/std": 5.419602870941162, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.20378214120864868, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 122.578125, "completions/mean_terminated_length": 122.578125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.300944258300335, "grad_norm": 5.46691370010376, "kl": 10.8359375, "learning_rate": 1.848048096156426e-05, "loss": 0.4208, "num_tokens": 22650552.0, "reward": -1.145751953125, "reward_std": 1.2284045219421387, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -5.47119140625, "rewards/ppl_reward/std": 2.5031235218048096, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.26349374651908875, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 130.28125, "completions/mean_terminated_length": 130.28125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.302162656107219, "grad_norm": 2.8683834075927734, "kl": 4.2734375, "learning_rate": 1.8475966261295947e-05, "loss": 0.1942, "num_tokens": 22666698.0, "reward": -2.3909912109375, "reward_std": 0.6948646306991577, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.352294921875, "rewards/ppl_reward/std": 4.196111679077148, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.3033810539141029, "grad_norm": 2.176603078842163, "kl": 5.677734375, "learning_rate": 1.8471445417239845e-05, "loss": 0.1915, "num_tokens": 22681058.0, "reward": -2.109619140625, "reward_std": 1.1779026985168457, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.62548828125, "rewards/ppl_reward/std": 4.575776100158691, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 133.234375, "completions/mean_terminated_length": 133.234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.304599451720987, "grad_norm": 2.165701389312744, "kl": 4.67578125, "learning_rate": 1.8466918432672886e-05, "loss": 0.1683, "num_tokens": 22697073.0, "reward": -0.3438720703125, "reward_std": 0.728156566619873, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -4.133056640625, "rewards/ppl_reward/std": 1.311460256576538, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19760315120220184, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 110.375, "completions/mean_terminated_length": 110.375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.305817849527871, "grad_norm": 3.060096502304077, "kl": 3.25, "learning_rate": 1.8462385310876444e-05, "loss": 0.0983, "num_tokens": 22711161.0, "reward": -1.08251953125, "reward_std": 0.5630360841751099, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.8369140625, "rewards/ppl_reward/std": 2.4361910820007324, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 121.09375, "completions/mean_terminated_length": 121.09375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.3070362473347548, "grad_norm": 4.060234069824219, "kl": 4.48828125, "learning_rate": 1.8457846055136336e-05, "loss": 0.1687, "num_tokens": 22726079.0, "reward": -1.3646240234375, "reward_std": 0.7952542304992676, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.307373046875, "rewards/ppl_reward/std": 4.059207916259766, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 116.171875, "completions/mean_terminated_length": 116.171875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.3082546451416388, "grad_norm": 1.9020951986312866, "kl": 4.42578125, "learning_rate": 1.8453300668742847e-05, "loss": 0.1371, "num_tokens": 22740322.0, "reward": -1.578125, "reward_std": 1.1774389743804932, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.6875, "rewards/ppl_reward/std": 3.081063985824585, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 132.21875, "completions/mean_terminated_length": 132.21875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.3094730429485226, "grad_norm": 2.7140960693359375, "kl": 3.451171875, "learning_rate": 1.8448749154990677e-05, "loss": 0.0655, "num_tokens": 22756848.0, "reward": -1.7596435546875, "reward_std": 1.3232976198196411, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.073974609375, "rewards/ppl_reward/std": 5.271219730377197, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 134.109375, "completions/mean_terminated_length": 134.109375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.3106914407554067, "grad_norm": 2.7380154132843018, "kl": 6.6015625, "learning_rate": 1.8444191517178992e-05, "loss": 0.2787, "num_tokens": 22772927.0, "reward": -1.1810302734375, "reward_std": 0.9508323669433594, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.737060546875, "rewards/ppl_reward/std": 4.20623779296875, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.23779743909835815, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 132.984375, "completions/mean_terminated_length": 132.984375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.3119098385622907, "grad_norm": 2.5640926361083984, "kl": 6.13671875, "learning_rate": 1.8439627758611384e-05, "loss": 0.2791, "num_tokens": 22788526.0, "reward": -2.275146484375, "reward_std": 1.033968448638916, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.09716796875, "rewards/ppl_reward/std": 5.295090675354004, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.3131282363691745, "grad_norm": 2.352900743484497, "kl": 6.6328125, "learning_rate": 1.8435057882595885e-05, "loss": 0.2341, "num_tokens": 22804074.0, "reward": -0.9609375, "reward_std": 1.445373773574829, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.2890625, "rewards/ppl_reward/std": 4.443073272705078, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.3143466341760583, "grad_norm": 2.0357043743133545, "kl": 5.77734375, "learning_rate": 1.843048189244496e-05, "loss": 0.2299, "num_tokens": 22819978.0, "reward": -0.5772705078125, "reward_std": 0.6549558639526367, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.615478515625, "rewards/ppl_reward/std": 1.9930660724639893, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19444002211093903, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 131.890625, "completions/mean_terminated_length": 131.890625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.3155650319829424, "grad_norm": 2.7554430961608887, "kl": 5.6953125, "learning_rate": 1.8425899791475504e-05, "loss": 0.2882, "num_tokens": 22835563.0, "reward": -1.4417724609375, "reward_std": 0.8084083795547485, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.524169921875, "rewards/ppl_reward/std": 5.64935302734375, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 123.484375, "completions/mean_terminated_length": 123.484375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.3167834297898264, "grad_norm": 5.9513444900512695, "kl": 8.640625, "learning_rate": 1.8421311583008847e-05, "loss": 0.3043, "num_tokens": 22849866.0, "reward": -2.1334228515625, "reward_std": 1.878509283065796, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.704345703125, "rewards/ppl_reward/std": 3.966691493988037, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 122.03125, "completions/mean_terminated_length": 122.03125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.3180018275967103, "grad_norm": 10.279375076293945, "kl": 8.99609375, "learning_rate": 1.8416717270370744e-05, "loss": 0.3345, "num_tokens": 22864060.0, "reward": -1.8917236328125, "reward_std": 1.658187747001648, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.236572265625, "rewards/ppl_reward/std": 6.549492359161377, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20518454909324646, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.3192202254035943, "grad_norm": 2.406507968902588, "kl": 3.96875, "learning_rate": 1.8412116856891374e-05, "loss": 0.0641, "num_tokens": 22879196.0, "reward": -1.783203125, "reward_std": 1.373082160949707, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.16796875, "rewards/ppl_reward/std": 5.058445453643799, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23007801175117493, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 126.265625, "completions/mean_terminated_length": 126.265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.3204386232104781, "grad_norm": 2.042097568511963, "kl": 1.93359375, "learning_rate": 1.8407510345905332e-05, "loss": 0.0518, "num_tokens": 22893373.0, "reward": -2.0360107421875, "reward_std": 0.38100510835647583, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.931396484375, "rewards/ppl_reward/std": 6.458098888397217, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 132.09375, "completions/mean_terminated_length": 132.09375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.3216570210173622, "grad_norm": 3.5784995555877686, "kl": 7.50390625, "learning_rate": 1.840289774075165e-05, "loss": 0.3497, "num_tokens": 22908451.0, "reward": -3.10009765625, "reward_std": 1.4611613750457764, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.6923828125, "rewards/ppl_reward/std": 6.413057804107666, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2043897658586502, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 140.109375, "completions/mean_terminated_length": 140.109375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.3228754188242462, "grad_norm": 1.7170838117599487, "kl": 4.236328125, "learning_rate": 1.8398279044773754e-05, "loss": 0.1536, "num_tokens": 22924730.0, "reward": -1.70849609375, "reward_std": 0.6140440702438354, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.0341796875, "rewards/ppl_reward/std": 5.015419006347656, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 136.484375, "completions/mean_terminated_length": 136.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.32409381663113, "grad_norm": 1.8468152284622192, "kl": 5.73046875, "learning_rate": 1.8393654261319504e-05, "loss": 0.2715, "num_tokens": 22940529.0, "reward": -4.6490478515625, "reward_std": 0.7667344212532043, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -12.813720703125, "rewards/ppl_reward/std": 19.191299438476562, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.209963858127594, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 139.6875, "completions/mean_terminated_length": 139.6875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.325312214438014, "grad_norm": 1.8979741334915161, "kl": 4.7109375, "learning_rate": 1.8389023393741157e-05, "loss": 0.1741, "num_tokens": 22956501.0, "reward": -2.3173828125, "reward_std": 1.6621426343917847, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.275390625, "rewards/ppl_reward/std": 7.306190013885498, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14412261545658112, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.3265306122448979, "grad_norm": 3.1794843673706055, "kl": 5.8828125, "learning_rate": 1.83843864453954e-05, "loss": 0.2237, "num_tokens": 22973309.0, "reward": -1.96142578125, "reward_std": 0.8777135014533997, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.5556640625, "rewards/ppl_reward/std": 4.91906213760376, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 132.859375, "completions/mean_terminated_length": 132.859375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.327749010051782, "grad_norm": 2.758178472518921, "kl": 4.578125, "learning_rate": 1.837974341964331e-05, "loss": 0.2349, "num_tokens": 22988788.0, "reward": -0.47509765625, "reward_std": 0.7465101480484009, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.4736328125, "rewards/ppl_reward/std": 2.064019203186035, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.17283059656620026, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 135.09375, "completions/mean_terminated_length": 135.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.328967407858666, "grad_norm": 2.684635877609253, "kl": 3.0322265625, "learning_rate": 1.8375094319850374e-05, "loss": 0.1924, "num_tokens": 23004586.0, "reward": -0.473388671875, "reward_std": 0.38530632853507996, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.74365234375, "rewards/ppl_reward/std": 1.7134290933609009, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.3301858056655498, "grad_norm": 1.833729863166809, "kl": 1.865234375, "learning_rate": 1.8370439149386484e-05, "loss": 0.039, "num_tokens": 23020150.0, "reward": -1.7188720703125, "reward_std": 0.5093904137611389, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.258056640625, "rewards/ppl_reward/std": 2.7097153663635254, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 115.25, "completions/mean_terminated_length": 115.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.3314042034724338, "grad_norm": 2.811945676803589, "kl": 4.2265625, "learning_rate": 1.8365777911625938e-05, "loss": 0.1578, "num_tokens": 23033950.0, "reward": -2.19384765625, "reward_std": 1.0686264038085938, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.9970703125, "rewards/ppl_reward/std": 5.506102085113525, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 108.953125, "completions/mean_terminated_length": 108.953125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.3326226012793176, "grad_norm": 5.533278465270996, "kl": 6.890625, "learning_rate": 1.8361110609947412e-05, "loss": 0.2132, "num_tokens": 23047619.0, "reward": -3.966552734375, "reward_std": 1.9601490497589111, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -11.48779296875, "rewards/ppl_reward/std": 15.12971019744873, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16810208559036255, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 114.21875, "completions/mean_terminated_length": 114.21875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.3338409990862017, "grad_norm": 4.3502302169799805, "kl": 5.349609375, "learning_rate": 1.8356437247734003e-05, "loss": 0.15, "num_tokens": 23061913.0, "reward": -1.2989501953125, "reward_std": 1.1005504131317139, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.285400390625, "rewards/ppl_reward/std": 5.884933948516846, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 120.5625, "completions/mean_terminated_length": 120.5625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.3350593968930857, "grad_norm": 2.3914215564727783, "kl": 6.890625, "learning_rate": 1.8351757828373183e-05, "loss": 0.3692, "num_tokens": 23076261.0, "reward": -3.0159912109375, "reward_std": 1.984482765197754, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.594482421875, "rewards/ppl_reward/std": 10.715400695800781, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1666666716337204, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 118.59375, "completions/mean_terminated_length": 118.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.3362777946999695, "grad_norm": 3.8909170627593994, "kl": 3.80078125, "learning_rate": 1.834707235525682e-05, "loss": 0.1294, "num_tokens": 23090835.0, "reward": -1.0924072265625, "reward_std": 0.41285812854766846, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.950439453125, "rewards/ppl_reward/std": 3.160039186477661, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 106.984375, "completions/mean_terminated_length": 106.984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.3374961925068534, "grad_norm": 2.515948534011841, "kl": 6.890625, "learning_rate": 1.834238083178117e-05, "loss": 0.2909, "num_tokens": 23104138.0, "reward": -0.21844482421875, "reward_std": 0.7324986457824707, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.0150146484375, "rewards/ppl_reward/std": 1.8335381746292114, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 117.390625, "completions/mean_terminated_length": 117.390625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.3387145903137374, "grad_norm": 4.097226619720459, "kl": 4.947265625, "learning_rate": 1.8337683261346875e-05, "loss": 0.1481, "num_tokens": 23118635.0, "reward": -1.61474609375, "reward_std": 0.36380645632743835, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.0810546875, "rewards/ppl_reward/std": 6.268994331359863, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 116.296875, "completions/mean_terminated_length": 116.296875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.3399329881206214, "grad_norm": 1.877345323562622, "kl": 3.630859375, "learning_rate": 1.833297964735896e-05, "loss": 0.0927, "num_tokens": 23133574.0, "reward": -1.373046875, "reward_std": 0.41154783964157104, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.53515625, "rewards/ppl_reward/std": 4.603307247161865, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 116.4375, "completions/mean_terminated_length": 116.4375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.3411513859275053, "grad_norm": 1.763567328453064, "kl": 3.0771484375, "learning_rate": 1.8328269993226833e-05, "loss": 0.0918, "num_tokens": 23147818.0, "reward": -2.1943359375, "reward_std": 0.5980134010314941, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.201171875, "rewards/ppl_reward/std": 7.353604793548584, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 108.796875, "completions/mean_terminated_length": 108.796875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.3423697837343893, "grad_norm": 3.0110650062561035, "kl": 5.375, "learning_rate": 1.8323554302364273e-05, "loss": 0.1583, "num_tokens": 23161717.0, "reward": -2.1016845703125, "reward_std": 1.4978389739990234, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.867431640625, "rewards/ppl_reward/std": 7.663597583770752, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2076999396085739, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.3435881815412731, "grad_norm": 2.139749765396118, "kl": 2.83203125, "learning_rate": 1.8318832578189446e-05, "loss": 0.1172, "num_tokens": 23176875.0, "reward": -1.0472412109375, "reward_std": 0.2955505847930908, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.985107421875, "rewards/ppl_reward/std": 2.1849465370178223, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 114.3125, "completions/mean_terminated_length": 114.3125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.3448065793481572, "grad_norm": 1.528130054473877, "kl": 3.48046875, "learning_rate": 1.8314104824124876e-05, "loss": 0.0948, "num_tokens": 23190751.0, "reward": -2.289794921875, "reward_std": 1.4137365818023682, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.35302734375, "rewards/ppl_reward/std": 9.40345287322998, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.3460249771550412, "grad_norm": 1.9358323812484741, "kl": 3.365234375, "learning_rate": 1.8309371043597472e-05, "loss": 0.179, "num_tokens": 23206047.0, "reward": -0.875, "reward_std": 0.3634450435638428, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.6171875, "rewards/ppl_reward/std": 3.195996046066284, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 103.1875, "completions/mean_terminated_length": 103.1875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.347243374961925, "grad_norm": 5.922077178955078, "kl": 5.6875, "learning_rate": 1.8304631240038508e-05, "loss": 0.2254, "num_tokens": 23219379.0, "reward": -1.7464599609375, "reward_std": 1.3928847312927246, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.133544921875, "rewards/ppl_reward/std": 4.297600269317627, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 137.34375, "completions/mean_terminated_length": 137.34375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.348461772768809, "grad_norm": 1.5883305072784424, "kl": 2.9267578125, "learning_rate": 1.8299885416883616e-05, "loss": 0.078, "num_tokens": 23236345.0, "reward": -0.33868408203125, "reward_std": 0.5097336173057556, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.5367431640625, "rewards/ppl_reward/std": 2.369926691055298, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 133.21875, "completions/mean_terminated_length": 133.21875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.349680170575693, "grad_norm": 2.8526360988616943, "kl": 6.416015625, "learning_rate": 1.82951335775728e-05, "loss": 0.3362, "num_tokens": 23252511.0, "reward": -1.470947265625, "reward_std": 0.4325962960720062, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.71533203125, "rewards/ppl_reward/std": 2.645942449569702, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 130.71875, "completions/mean_terminated_length": 130.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.350898568382577, "grad_norm": 4.742530822753906, "kl": 4.1630859375, "learning_rate": 1.8290375725550417e-05, "loss": 0.1371, "num_tokens": 23267701.0, "reward": -1.85125732421875, "reward_std": 0.8512612581253052, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.5072021484375, "rewards/ppl_reward/std": 6.649858474731445, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.16587424278259277, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.352116966189461, "grad_norm": 2.3467929363250732, "kl": 6.8046875, "learning_rate": 1.8285611864265192e-05, "loss": 0.2837, "num_tokens": 23285805.0, "reward": -0.87353515625, "reward_std": 0.5601216554641724, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.4189453125, "rewards/ppl_reward/std": 2.0941476821899414, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 144.28125, "completions/mean_terminated_length": 144.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.3533353639963448, "grad_norm": 2.5982067584991455, "kl": 5.7109375, "learning_rate": 1.8280841997170203e-05, "loss": 0.2501, "num_tokens": 23302655.0, "reward": -0.938720703125, "reward_std": 1.0174682140350342, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.37744140625, "rewards/ppl_reward/std": 2.8197057247161865, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2578144073486328, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 120.3125, "completions/mean_terminated_length": 120.3125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.3545537618032286, "grad_norm": 3.378563642501831, "kl": 4.78125, "learning_rate": 1.827606612772287e-05, "loss": 0.0854, "num_tokens": 23317027.0, "reward": -1.4151611328125, "reward_std": 1.4168128967285156, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.267822265625, "rewards/ppl_reward/std": 4.443974494934082, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.27048972249031067, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 126.34375, "completions/mean_terminated_length": 126.34375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.3557721596101127, "grad_norm": 4.7890214920043945, "kl": 7.65625, "learning_rate": 1.827128425938498e-05, "loss": 0.1805, "num_tokens": 23332945.0, "reward": -1.56787109375, "reward_std": 1.9519903659820557, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.4013671875, "rewards/ppl_reward/std": 4.072503566741943, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.3294980227947235, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 118.421875, "completions/mean_terminated_length": 118.421875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.3569905574169967, "grad_norm": 3.2253732681274414, "kl": 4.041015625, "learning_rate": 1.826649639562266e-05, "loss": 0.0767, "num_tokens": 23347652.0, "reward": -0.5526123046875, "reward_std": 1.0058224201202393, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.589599609375, "rewards/ppl_reward/std": 2.6405396461486816, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.3052735924720764, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 123.484375, "completions/mean_terminated_length": 123.484375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.3582089552238805, "grad_norm": 1.9179315567016602, "kl": 6.19921875, "learning_rate": 1.826170253990638e-05, "loss": 0.1137, "num_tokens": 23362547.0, "reward": -3.2100830078125, "reward_std": 2.179152727127075, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -9.584228515625, "rewards/ppl_reward/std": 4.49162483215332, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.3597899079322815, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 124.15625, "completions/mean_terminated_length": 124.15625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.3594273530307646, "grad_norm": 3.4625821113586426, "kl": 5.96875, "learning_rate": 1.8256902695710962e-05, "loss": 0.1551, "num_tokens": 23378093.0, "reward": -3.556396484375, "reward_std": 1.6067014932632446, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -10.27685546875, "rewards/ppl_reward/std": 13.833528518676758, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.37332215905189514, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.3606457508376484, "grad_norm": 2.2693088054656982, "kl": 2.083984375, "learning_rate": 1.825209686651556e-05, "loss": 0.0174, "num_tokens": 23392961.0, "reward": -0.23828125, "reward_std": 0.706019401550293, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.125, "rewards/ppl_reward/std": 1.4551358222961426, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 113.3125, "completions/mean_terminated_length": 113.3125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.3618641486445324, "grad_norm": 5.020993709564209, "kl": 4.42578125, "learning_rate": 1.8247285055803672e-05, "loss": 0.0937, "num_tokens": 23407157.0, "reward": -4.0234375, "reward_std": 1.4538452625274658, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.4140625, "rewards/ppl_reward/std": 8.2958345413208, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.3357197046279907, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 106.34375, "completions/mean_terminated_length": 106.34375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.3630825464514165, "grad_norm": 2.089416027069092, "kl": 4.60546875, "learning_rate": 1.824246726706313e-05, "loss": 0.0559, "num_tokens": 23420659.0, "reward": -1.1790771484375, "reward_std": 1.3645910024642944, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -5.490966796875, "rewards/ppl_reward/std": 3.1555769443511963, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.3200235366821289, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 107.71875, "completions/mean_terminated_length": 107.71875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.3643009442583003, "grad_norm": 5.095607280731201, "kl": 6.3359375, "learning_rate": 1.8237643503786097e-05, "loss": 0.0999, "num_tokens": 23434137.0, "reward": -2.5928955078125, "reward_std": 2.165386438369751, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -7.857666015625, "rewards/ppl_reward/std": 7.282750606536865, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.3665550649166107, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 106.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.3655193420651843, "grad_norm": 2.8090946674346924, "kl": 4.703125, "learning_rate": 1.8232813769469068e-05, "loss": 0.0388, "num_tokens": 23448081.0, "reward": -2.529296875, "reward_std": 1.5894581079483032, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -7.09765625, "rewards/ppl_reward/std": 4.211729049682617, "rewards/tag_count_reward/mean": 0.64453125, "rewards/tag_count_reward/std": 0.3748139441013336, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 97.296875, "completions/mean_terminated_length": 97.296875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.3667377398720681, "grad_norm": 2.2347664833068848, "kl": 4.0703125, "learning_rate": 1.822797806761287e-05, "loss": 0.0627, "num_tokens": 23461708.0, "reward": -3.1458740234375, "reward_std": 1.4025458097457886, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -8.182373046875, "rewards/ppl_reward/std": 3.381335973739624, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.36110159754753113, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 107.265625, "completions/mean_terminated_length": 107.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.3679561376789522, "grad_norm": 2.629166841506958, "kl": 2.771484375, "learning_rate": 1.8223136401722648e-05, "loss": -0.023, "num_tokens": 23475325.0, "reward": -2.3714599609375, "reward_std": 1.8683812618255615, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5029674172401428, "rewards/ppl_reward/mean": -7.156982421875, "rewards/ppl_reward/std": 4.644054889678955, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.3433890640735626, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 108.5, "completions/mean_terminated_length": 108.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.3691745354858362, "grad_norm": 1.8464223146438599, "kl": 4.125, "learning_rate": 1.821828877530788e-05, "loss": 0.1107, "num_tokens": 23489005.0, "reward": -2.0277099609375, "reward_std": 1.5405070781707764, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "rewards/ppl_reward/mean": -6.547607421875, "rewards/ppl_reward/std": 5.92731237411499, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.2973250150680542, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 121.171875, "completions/mean_terminated_length": 121.171875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.37039293329272, "grad_norm": 3.0496573448181152, "kl": 3.46484375, "learning_rate": 1.8213435191882354e-05, "loss": 0.1571, "num_tokens": 23504808.0, "reward": -1.0574951171875, "reward_std": 1.198624610900879, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "rewards/ppl_reward/mean": -4.818115234375, "rewards/ppl_reward/std": 2.225447416305542, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2868436276912689, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 109.90625, "completions/mean_terminated_length": 109.90625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.371611331099604, "grad_norm": 2.2124407291412354, "kl": 3.28125, "learning_rate": 1.820857565496418e-05, "loss": 0.1429, "num_tokens": 23518986.0, "reward": -1.564697265625, "reward_std": 0.9174513816833496, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -6.34033203125, "rewards/ppl_reward/std": 3.065783739089966, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 104.421875, "completions/mean_terminated_length": 104.421875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.372829728906488, "grad_norm": 2.5135953426361084, "kl": 3.78515625, "learning_rate": 1.8203710168075786e-05, "loss": 0.1064, "num_tokens": 23533269.0, "reward": -3.00506591796875, "reward_std": 0.840840220451355, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -9.2757568359375, "rewards/ppl_reward/std": 8.413545608520508, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 109.046875, "completions/mean_terminated_length": 109.046875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.374048126713372, "grad_norm": 2.3479864597320557, "kl": 5.3125, "learning_rate": 1.819883873474391e-05, "loss": 0.1974, "num_tokens": 23546960.0, "reward": -4.68359375, "reward_std": 4.042913913726807, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -12.765625, "rewards/ppl_reward/std": 17.515993118286133, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 94.3125, "completions/mean_terminated_length": 94.3125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.375266524520256, "grad_norm": 2.611091375350952, "kl": 5.703125, "learning_rate": 1.8193961358499606e-05, "loss": 0.2348, "num_tokens": 23559380.0, "reward": -2.9404296875, "reward_std": 0.9129561185836792, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -9.263671875, "rewards/ppl_reward/std": 6.742772102355957, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13903142511844635, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.3764849223271398, "grad_norm": 3.2055835723876953, "kl": 7.2109375, "learning_rate": 1.8189078042878222e-05, "loss": 0.2934, "num_tokens": 23573044.0, "reward": -2.3712158203125, "reward_std": 1.8769267797470093, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -8.054931640625, "rewards/ppl_reward/std": 6.013472557067871, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14773420989513397, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 107.390625, "completions/mean_terminated_length": 107.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.3777033201340236, "grad_norm": 4.5454607009887695, "kl": 8.33984375, "learning_rate": 1.8184188791419425e-05, "loss": 0.3459, "num_tokens": 23587213.0, "reward": -1.26275634765625, "reward_std": 1.040454626083374, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.8927001953125, "rewards/ppl_reward/std": 4.097024917602539, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18395289778709412, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 102.65625, "completions/mean_terminated_length": 102.65625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 1.3789217179409077, "grad_norm": 6.230345249176025, "kl": 8.4765625, "learning_rate": 1.8179293607667177e-05, "loss": 0.3605, "num_tokens": 23600703.0, "reward": -1.227294921875, "reward_std": 1.008967399597168, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.71240234375, "rewards/ppl_reward/std": 2.479372501373291, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.22184601426124573, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 97.484375, "completions/mean_terminated_length": 97.484375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.3801401157477917, "grad_norm": 3.4587063789367676, "kl": 7.0625, "learning_rate": 1.8174392495169747e-05, "loss": 0.4018, "num_tokens": 23613342.0, "reward": -4.85302734375, "reward_std": 1.3971624374389648, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -13.2451171875, "rewards/ppl_reward/std": 20.130964279174805, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 104.9375, "completions/mean_terminated_length": 104.9375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.3813585135546755, "grad_norm": 3.025589942932129, "kl": 7.0703125, "learning_rate": 1.816948545747969e-05, "loss": 0.3737, "num_tokens": 23627546.0, "reward": -4.1700439453125, "reward_std": 2.3295793533325195, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -11.738525390625, "rewards/ppl_reward/std": 17.92730140686035, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21183809638023376, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 104.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.3825769113615596, "grad_norm": 2.8332228660583496, "kl": 6.1171875, "learning_rate": 1.8164572498153875e-05, "loss": 0.3674, "num_tokens": 23641634.0, "reward": -1.3787841796875, "reward_std": 0.945637047290802, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.241943359375, "rewards/ppl_reward/std": 3.7740345001220703, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 104.296875, "completions/mean_terminated_length": 104.296875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.3837953091684434, "grad_norm": 2.2744927406311035, "kl": 6.3359375, "learning_rate": 1.8159653620753435e-05, "loss": 0.241, "num_tokens": 23655533.0, "reward": -2.7435302734375, "reward_std": 1.3290956020355225, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.862060546875, "rewards/ppl_reward/std": 4.345559597015381, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21128857135772705, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 88.578125, "completions/mean_terminated_length": 88.578125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.3850137069753274, "grad_norm": 2.7266616821289062, "kl": 4.048828125, "learning_rate": 1.815472882884382e-05, "loss": 0.1877, "num_tokens": 23667490.0, "reward": -1.44415283203125, "reward_std": 1.4056708812713623, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.4273681640625, "rewards/ppl_reward/std": 5.2906494140625, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19760315120220184, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 97.734375, "completions/mean_terminated_length": 97.734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.3862321047822115, "grad_norm": 2.338831901550293, "kl": 4.28125, "learning_rate": 1.814979812599475e-05, "loss": 0.1338, "num_tokens": 23680625.0, "reward": -0.4251708984375, "reward_std": 0.6893582344055176, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.436279296875, "rewards/ppl_reward/std": 2.213066816329956, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 102.296875, "completions/mean_terminated_length": 102.296875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.3874505025890953, "grad_norm": 3.021136999130249, "kl": 3.8515625, "learning_rate": 1.8144861515780242e-05, "loss": 0.113, "num_tokens": 23694308.0, "reward": -2.6114501953125, "reward_std": 0.9625617265701294, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.011962890625, "rewards/ppl_reward/std": 6.726587295532227, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 108.421875, "completions/mean_terminated_length": 108.421875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.3886689003959793, "grad_norm": 3.5869181156158447, "kl": 3.998046875, "learning_rate": 1.8139919001778584e-05, "loss": 0.1366, "num_tokens": 23708487.0, "reward": -1.2244873046875, "reward_std": 0.4585367739200592, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.159912109375, "rewards/ppl_reward/std": 4.492055416107178, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 102.390625, "completions/mean_terminated_length": 102.390625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.3898872982028632, "grad_norm": 2.041027545928955, "kl": 2.3486328125, "learning_rate": 1.8134970587572345e-05, "loss": 0.0491, "num_tokens": 23722368.0, "reward": -0.1036376953125, "reward_std": 0.2640083134174347, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.082275390625, "rewards/ppl_reward/std": 1.359120488166809, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 120.59375, "completions/mean_terminated_length": 120.59375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.3911056960097472, "grad_norm": 2.2857885360717773, "kl": 7.84375, "learning_rate": 1.813001627674838e-05, "loss": 0.4057, "num_tokens": 23737918.0, "reward": -1.184814453125, "reward_std": 0.7426341772079468, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.89306640625, "rewards/ppl_reward/std": 4.444544792175293, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 101.578125, "completions/mean_terminated_length": 101.578125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.3923240938166312, "grad_norm": 3.8210322856903076, "kl": 4.0703125, "learning_rate": 1.8125056072897807e-05, "loss": 0.1887, "num_tokens": 23751131.0, "reward": -2.4661865234375, "reward_std": 0.7060589790344238, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.705810546875, "rewards/ppl_reward/std": 8.6532621383667, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 116.78125, "completions/mean_terminated_length": 116.78125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.393542491623515, "grad_norm": 3.0759682655334473, "kl": 5.4921875, "learning_rate": 1.812008997961602e-05, "loss": 0.327, "num_tokens": 23765773.0, "reward": -1.982421875, "reward_std": 0.7865005731582642, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.61328125, "rewards/ppl_reward/std": 3.5091888904571533, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22736713290214539, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 120.671875, "completions/mean_terminated_length": 120.671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.3947608894303989, "grad_norm": 4.781408309936523, "kl": 5.1640625, "learning_rate": 1.8115118000502685e-05, "loss": 0.3245, "num_tokens": 23780664.0, "reward": -2.0660400390625, "reward_std": 0.46502140164375305, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.866455078125, "rewards/ppl_reward/std": 8.146093368530273, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 110.671875, "completions/mean_terminated_length": 110.671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.395979287237283, "grad_norm": 4.3979716300964355, "kl": 8.6484375, "learning_rate": 1.811014013916173e-05, "loss": 0.4233, "num_tokens": 23794067.0, "reward": -2.26318359375, "reward_std": 0.7117461562156677, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.1357421875, "rewards/ppl_reward/std": 4.718409061431885, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13706642389297485, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 119.046875, "completions/mean_terminated_length": 119.046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.397197685044167, "grad_norm": 9.240001678466797, "kl": 14.69140625, "learning_rate": 1.810515639920135e-05, "loss": 0.6918, "num_tokens": 23808454.0, "reward": -3.3853759765625, "reward_std": 2.151697874069214, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -10.145751953125, "rewards/ppl_reward/std": 15.246391296386719, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21764887869358063, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 108.28125, "completions/mean_terminated_length": 108.28125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.3984160828510508, "grad_norm": 9.336506843566895, "kl": 9.03125, "learning_rate": 1.8100166784233996e-05, "loss": 0.3536, "num_tokens": 23821792.0, "reward": -7.390869140625, "reward_std": 1.9095139503479004, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -18.10205078125, "rewards/ppl_reward/std": 28.235864639282227, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.19822971522808075, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 123.234375, "completions/mean_terminated_length": 123.234375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.3996344806579348, "grad_norm": 7.516034126281738, "kl": 14.890625, "learning_rate": 1.809517129787638e-05, "loss": 0.7449, "num_tokens": 23836519.0, "reward": -0.63037109375, "reward_std": 0.9516947269439697, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -4.6357421875, "rewards/ppl_reward/std": 2.1233925819396973, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 109.625, "completions/mean_terminated_length": 109.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.4008528784648187, "grad_norm": 4.929940700531006, "kl": 8.6484375, "learning_rate": 1.8090169943749477e-05, "loss": 0.3754, "num_tokens": 23850543.0, "reward": -0.8482666015625, "reward_std": 0.7873914241790771, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.329345703125, "rewards/ppl_reward/std": 3.5148940086364746, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 108.21875, "completions/mean_terminated_length": 108.21875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.4020712762717027, "grad_norm": 4.076503753662109, "kl": 4.1796875, "learning_rate": 1.8085162725478502e-05, "loss": 0.0899, "num_tokens": 23864293.0, "reward": -1.3970947265625, "reward_std": 1.7217830419540405, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.434814453125, "rewards/ppl_reward/std": 4.854090690612793, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 112.953125, "completions/mean_terminated_length": 112.953125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.4032896740785867, "grad_norm": 4.361641883850098, "kl": 10.1796875, "learning_rate": 1.8080149646692932e-05, "loss": 0.4336, "num_tokens": 23878442.0, "reward": -2.2127685546875, "reward_std": 2.0264387130737305, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -7.409912109375, "rewards/ppl_reward/std": 6.635274410247803, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.29367929697036743, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 114.53125, "completions/mean_terminated_length": 114.53125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.4045080718854706, "grad_norm": 8.338552474975586, "kl": 10.46875, "learning_rate": 1.8075130711026484e-05, "loss": 0.4383, "num_tokens": 23892796.0, "reward": -1.7890625, "reward_std": 1.091988205909729, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.5037065148353577, "rewards/ppl_reward/mean": -6.28125, "rewards/ppl_reward/std": 3.785414934158325, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.209963858127594, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 105.609375, "completions/mean_terminated_length": 105.609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.4057264696923546, "grad_norm": 3.5295791625976562, "kl": 6.0078125, "learning_rate": 1.8070105922117128e-05, "loss": 0.2354, "num_tokens": 23906067.0, "reward": -2.3909912109375, "reward_std": 2.728938579559326, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49501484632492065, "rewards/ppl_reward/mean": -7.250732421875, "rewards/ppl_reward/std": 12.021830558776855, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.4069448674992384, "grad_norm": 2.4986038208007812, "kl": 6.19140625, "learning_rate": 1.806507528360707e-05, "loss": 0.2997, "num_tokens": 23920635.0, "reward": -1.2623291015625, "reward_std": 0.9909846782684326, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "rewards/ppl_reward/mean": -5.594970703125, "rewards/ppl_reward/std": 2.3917787075042725, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2088906317949295, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 116.796875, "completions/mean_terminated_length": 116.796875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.4081632653061225, "grad_norm": 2.054525136947632, "kl": 4.828125, "learning_rate": 1.806003879914276e-05, "loss": 0.1937, "num_tokens": 23935262.0, "reward": -2.7764892578125, "reward_std": 1.910400152206421, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -8.717041015625, "rewards/ppl_reward/std": 8.870408058166504, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.27792346477508545, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 116.40625, "completions/mean_terminated_length": 116.40625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.4093816631130065, "grad_norm": 4.464776039123535, "kl": 2.3828125, "learning_rate": 1.805499647237488e-05, "loss": 0.1075, "num_tokens": 23949488.0, "reward": -0.76806640625, "reward_std": 0.8798320293426514, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.2392578125, "rewards/ppl_reward/std": 4.51158332824707, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 104.71875, "completions/mean_terminated_length": 104.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.4106000609198903, "grad_norm": 2.651426076889038, "kl": 2.169921875, "learning_rate": 1.8049948306958356e-05, "loss": 0.0762, "num_tokens": 23962678.0, "reward": -1.3929443359375, "reward_std": 0.30199962854385376, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.684326171875, "rewards/ppl_reward/std": 3.0919034481048584, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 102.65625, "completions/mean_terminated_length": 102.65625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.4118184587267744, "grad_norm": 1.8374468088150024, "kl": 1.396484375, "learning_rate": 1.8044894306552338e-05, "loss": 0.0055, "num_tokens": 23975616.0, "reward": -2.79571533203125, "reward_std": 0.7431029081344604, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -9.5445556640625, "rewards/ppl_reward/std": 6.363065242767334, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 102.09375, "completions/mean_terminated_length": 102.09375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.4130368565336582, "grad_norm": 2.111323356628418, "kl": 2.169921875, "learning_rate": 1.8039834474820205e-05, "loss": 0.0557, "num_tokens": 23988790.0, "reward": -1.6500244140625, "reward_std": 0.49865126609802246, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.104736328125, "rewards/ppl_reward/std": 2.1744484901428223, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 114.28125, "completions/mean_terminated_length": 114.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.4142552543405422, "grad_norm": 2.891671895980835, "kl": 1.58984375, "learning_rate": 1.8034768815429577e-05, "loss": -0.0179, "num_tokens": 24003320.0, "reward": -1.5069580078125, "reward_std": 0.6456806063652039, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.748291015625, "rewards/ppl_reward/std": 2.8537678718566895, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 108.203125, "completions/mean_terminated_length": 108.203125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.4154736521474263, "grad_norm": 23.734968185424805, "kl": 3.10546875, "learning_rate": 1.8029697332052277e-05, "loss": 0.0793, "num_tokens": 24016653.0, "reward": -3.8115234375, "reward_std": 1.5112736225128174, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -10.873046875, "rewards/ppl_reward/std": 8.184126853942871, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21764887869358063, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 127.640625, "completions/mean_terminated_length": 127.640625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.41669204995431, "grad_norm": 1.8757320642471313, "kl": 5.19140625, "learning_rate": 1.8024620028364363e-05, "loss": 0.1778, "num_tokens": 24032270.0, "reward": -3.43359375, "reward_std": 3.4456043243408203, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -10.2421875, "rewards/ppl_reward/std": 16.905406951904297, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 113.0, "completions/mean_terminated_length": 113.0, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.417910447761194, "grad_norm": 2.305142402648926, "kl": 2.86328125, "learning_rate": 1.8019536908046114e-05, "loss": 0.107, "num_tokens": 24045926.0, "reward": -1.141845703125, "reward_std": 0.485755980014801, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.99462890625, "rewards/ppl_reward/std": 2.8777220249176025, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.419128845568078, "grad_norm": 1.8131229877471924, "kl": 3.794921875, "learning_rate": 1.8014447974782013e-05, "loss": 0.0935, "num_tokens": 24060792.0, "reward": -1.2738037109375, "reward_std": 0.8755605220794678, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.086669921875, "rewards/ppl_reward/std": 3.39389967918396, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 130.703125, "completions/mean_terminated_length": 130.703125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.420347243374962, "grad_norm": 3.0022428035736084, "kl": 5.373046875, "learning_rate": 1.8009353232260766e-05, "loss": 0.2219, "num_tokens": 24076197.0, "reward": -0.7178955078125, "reward_std": 1.2257750034332275, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.927978515625, "rewards/ppl_reward/std": 3.4843363761901855, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 136.03125, "completions/mean_terminated_length": 136.03125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.4215656411818458, "grad_norm": 2.753106117248535, "kl": 9.9921875, "learning_rate": 1.8004252684175286e-05, "loss": 0.4994, "num_tokens": 24091847.0, "reward": -2.204833984375, "reward_std": 1.6110090017318726, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -7.72216796875, "rewards/ppl_reward/std": 5.371333122253418, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20653989911079407, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 139.203125, "completions/mean_terminated_length": 139.203125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.4227840389887298, "grad_norm": 3.3236827850341797, "kl": 8.05078125, "learning_rate": 1.79991463342227e-05, "loss": 0.3582, "num_tokens": 24107804.0, "reward": -1.381103515625, "reward_std": 1.126621961593628, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.09033203125, "rewards/ppl_reward/std": 2.6350529193878174, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2302463799715042, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 140.9375, "completions/mean_terminated_length": 140.9375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.4240024367956137, "grad_norm": 2.757984161376953, "kl": 6.4453125, "learning_rate": 1.7994034186104327e-05, "loss": 0.2779, "num_tokens": 24124008.0, "reward": -1.1434326171875, "reward_std": 0.8162230849266052, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.732177734375, "rewards/ppl_reward/std": 2.522624969482422, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1763816624879837, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 134.421875, "completions/mean_terminated_length": 134.421875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.4252208346024977, "grad_norm": 2.9239625930786133, "kl": 5.30078125, "learning_rate": 1.7988916243525706e-05, "loss": 0.1707, "num_tokens": 24139459.0, "reward": -2.4727783203125, "reward_std": 1.6951923370361328, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.414306640625, "rewards/ppl_reward/std": 10.370092391967773, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 134.421875, "completions/mean_terminated_length": 134.421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.4264392324093818, "grad_norm": 1.8926186561584473, "kl": 4.78515625, "learning_rate": 1.7983792510196567e-05, "loss": 0.207, "num_tokens": 24154606.0, "reward": -1.701416015625, "reward_std": 1.603752851486206, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.89501953125, "rewards/ppl_reward/std": 4.6773600578308105, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 136.53125, "completions/mean_terminated_length": 136.53125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.4276576302162656, "grad_norm": 5.261727809906006, "kl": 3.29296875, "learning_rate": 1.7978662989830834e-05, "loss": 0.0208, "num_tokens": 24170384.0, "reward": -1.755859375, "reward_std": 0.9155998229980469, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.16015625, "rewards/ppl_reward/std": 2.8071722984313965, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 147.234375, "completions/mean_terminated_length": 147.234375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.4288760280231496, "grad_norm": 1.7297346591949463, "kl": 4.1796875, "learning_rate": 1.7973527686146636e-05, "loss": 0.1147, "num_tokens": 24187847.0, "reward": -0.65185546875, "reward_std": 0.7335302233695984, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.8896484375, "rewards/ppl_reward/std": 1.734045147895813, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 138.921875, "completions/mean_terminated_length": 138.921875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.4300944258300334, "grad_norm": 1.679976463317871, "kl": 3.71875, "learning_rate": 1.796838660286628e-05, "loss": 0.1836, "num_tokens": 24203746.0, "reward": -1.6051025390625, "reward_std": 0.6852353811264038, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.772705078125, "rewards/ppl_reward/std": 3.6938021183013916, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 142.109375, "completions/mean_terminated_length": 142.109375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.4313128236369175, "grad_norm": 2.5567305088043213, "kl": 4.765625, "learning_rate": 1.7963239743716277e-05, "loss": 0.1491, "num_tokens": 24220089.0, "reward": -0.7791748046875, "reward_std": 0.8546320199966431, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.066162109375, "rewards/ppl_reward/std": 2.4569201469421387, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2372427135705948, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 138.171875, "completions/mean_terminated_length": 138.171875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.4325312214438015, "grad_norm": 2.0024333000183105, "kl": 5.32421875, "learning_rate": 1.795808711242731e-05, "loss": 0.1305, "num_tokens": 24235740.0, "reward": -1.802734375, "reward_std": 1.6114801168441772, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.99609375, "rewards/ppl_reward/std": 5.120520114898682, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2345155030488968, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 131.46875, "completions/mean_terminated_length": 131.46875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.4337496192506853, "grad_norm": 2.54262638092041, "kl": 6.12109375, "learning_rate": 1.7952928712734266e-05, "loss": 0.135, "num_tokens": 24251122.0, "reward": -1.35107421875, "reward_std": 1.2997504472732544, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.1474609375, "rewards/ppl_reward/std": 4.211683750152588, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2287265807390213, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 144.59375, "completions/mean_terminated_length": 144.59375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.4349680170575694, "grad_norm": 1.885773777961731, "kl": 5.87109375, "learning_rate": 1.7947764548376194e-05, "loss": 0.1825, "num_tokens": 24268128.0, "reward": -1.2252197265625, "reward_std": 0.9859084486961365, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.997314453125, "rewards/ppl_reward/std": 3.9773097038269043, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 140.40625, "completions/mean_terminated_length": 140.40625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.4361864148644532, "grad_norm": 1.657332181930542, "kl": 4.189453125, "learning_rate": 1.794259462309632e-05, "loss": 0.109, "num_tokens": 24284218.0, "reward": -1.048095703125, "reward_std": 1.1142274141311646, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.76025390625, "rewards/ppl_reward/std": 5.51830530166626, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 133.71875, "completions/mean_terminated_length": 133.71875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.4374048126713372, "grad_norm": 2.5961410999298096, "kl": 5.55859375, "learning_rate": 1.7937418940642076e-05, "loss": 0.1471, "num_tokens": 24299216.0, "reward": -1.3843994140625, "reward_std": 0.936252236366272, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.276611328125, "rewards/ppl_reward/std": 3.4079556465148926, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 150.03125, "completions/mean_terminated_length": 150.03125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.438623210478221, "grad_norm": 2.7191896438598633, "kl": 5.396484375, "learning_rate": 1.7932237504765025e-05, "loss": 0.1683, "num_tokens": 24315954.0, "reward": -1.0799560546875, "reward_std": 1.1740164756774902, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.652099609375, "rewards/ppl_reward/std": 3.2918660640716553, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 135.46875, "completions/mean_terminated_length": 135.46875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.439841608285105, "grad_norm": 1.6518909931182861, "kl": 4.84375, "learning_rate": 1.792705031922093e-05, "loss": 0.0926, "num_tokens": 24331208.0, "reward": -2.0791015625, "reward_std": 2.0644590854644775, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.681640625, "rewards/ppl_reward/std": 9.60647201538086, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 138.046875, "completions/mean_terminated_length": 138.046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.441060006091989, "grad_norm": 2.40038800239563, "kl": 4.537109375, "learning_rate": 1.7921857387769712e-05, "loss": 0.0824, "num_tokens": 24347243.0, "reward": -4.6544189453125, "reward_std": 2.354243278503418, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -12.761962890625, "rewards/ppl_reward/std": 17.497886657714844, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2662152051925659, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 132.34375, "completions/mean_terminated_length": 132.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.442278403898873, "grad_norm": 1.8723747730255127, "kl": 3.78125, "learning_rate": 1.7916658714175456e-05, "loss": 0.0601, "num_tokens": 24362657.0, "reward": -0.8070068359375, "reward_std": 0.753936767578125, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.262451171875, "rewards/ppl_reward/std": 1.6168168783187866, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 135.71875, "completions/mean_terminated_length": 135.71875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.443496801705757, "grad_norm": 2.222844362258911, "kl": 4.3671875, "learning_rate": 1.7911454302206408e-05, "loss": 0.0819, "num_tokens": 24378455.0, "reward": -2.1666259765625, "reward_std": 1.1219538450241089, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.778564453125, "rewards/ppl_reward/std": 7.079843997955322, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 126.953125, "completions/mean_terminated_length": 126.953125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.4447151995126408, "grad_norm": 2.1578853130340576, "kl": 3.388671875, "learning_rate": 1.790624415563498e-05, "loss": 0.0646, "num_tokens": 24392980.0, "reward": -1.36968994140625, "reward_std": 1.5058690309524536, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.3018798828125, "rewards/ppl_reward/std": 6.03299617767334, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 133.78125, "completions/mean_terminated_length": 133.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.4459335973195249, "grad_norm": 1.6942471265792847, "kl": 3.896484375, "learning_rate": 1.7901028278237736e-05, "loss": 0.063, "num_tokens": 24408574.0, "reward": -0.7113037109375, "reward_std": 0.47168877720832825, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.125732421875, "rewards/ppl_reward/std": 1.157454013824463, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 136.234375, "completions/mean_terminated_length": 136.234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.4471519951264087, "grad_norm": 2.6858551502227783, "kl": 3.353515625, "learning_rate": 1.7895806673795396e-05, "loss": 0.0371, "num_tokens": 24424733.0, "reward": -0.7288818359375, "reward_std": 0.694329023361206, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.098388671875, "rewards/ppl_reward/std": 2.0521488189697266, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.4483703929332927, "grad_norm": 1.6845473051071167, "kl": 2.3828125, "learning_rate": 1.7890579346092828e-05, "loss": 0.129, "num_tokens": 24440659.0, "reward": -5.853515625, "reward_std": 1.2447900772094727, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -15.40234375, "rewards/ppl_reward/std": 21.073305130004883, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 144.796875, "completions/mean_terminated_length": 144.796875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.4495887907401768, "grad_norm": 1.4114511013031006, "kl": 3.05859375, "learning_rate": 1.7885346298919047e-05, "loss": 0.0242, "num_tokens": 24457014.0, "reward": -0.8599853515625, "reward_std": 0.4724671542644501, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.423095703125, "rewards/ppl_reward/std": 1.8264704942703247, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 161.15625, "completions/mean_terminated_length": 161.15625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.4508071885470606, "grad_norm": 1.9438704252243042, "kl": 5.7626953125, "learning_rate": 1.788010753606722e-05, "loss": 0.2396, "num_tokens": 24476008.0, "reward": -1.9346923828125, "reward_std": 0.6234562397003174, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.322509765625, "rewards/ppl_reward/std": 4.476972579956055, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 136.28125, "completions/mean_terminated_length": 136.28125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.4520255863539446, "grad_norm": 3.0195748805999756, "kl": 5.1640625, "learning_rate": 1.7874863061334658e-05, "loss": 0.1685, "num_tokens": 24491202.0, "reward": -0.5233154296875, "reward_std": 0.8944854140281677, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.546630859375, "rewards/ppl_reward/std": 1.81948983669281, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 134.015625, "completions/mean_terminated_length": 134.015625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.4532439841608285, "grad_norm": 1.7285772562026978, "kl": 2.8515625, "learning_rate": 1.7869612878522805e-05, "loss": 0.0607, "num_tokens": 24506563.0, "reward": -0.7265625, "reward_std": 0.5481100082397461, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.0625, "rewards/ppl_reward/std": 2.664066791534424, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 151.484375, "completions/mean_terminated_length": 151.484375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.4544623819677125, "grad_norm": 4.4956746101379395, "kl": 7.265625, "learning_rate": 1.7864356991437243e-05, "loss": 0.3168, "num_tokens": 24523210.0, "reward": -5.3770751953125, "reward_std": 1.9445128440856934, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -14.176025390625, "rewards/ppl_reward/std": 16.42524528503418, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24282869696617126, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 132.96875, "completions/mean_terminated_length": 132.96875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.4556807797745965, "grad_norm": 3.071831226348877, "kl": 7.453125, "learning_rate": 1.7859095403887697e-05, "loss": 0.2348, "num_tokens": 24538472.0, "reward": -0.892333984375, "reward_std": 1.2293455600738525, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.01904296875, "rewards/ppl_reward/std": 2.8223230838775635, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2744719088077545, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 147.90625, "completions/mean_terminated_length": 147.90625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.4568991775814804, "grad_norm": 2.389322280883789, "kl": 6.2421875, "learning_rate": 1.7853828119688014e-05, "loss": 0.2337, "num_tokens": 24555626.0, "reward": -0.9827880859375, "reward_std": 0.8433812856674194, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.403076171875, "rewards/ppl_reward/std": 2.5878255367279053, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22930191457271576, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 139.078125, "completions/mean_terminated_length": 139.078125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.4581175753883642, "grad_norm": 1.78847074508667, "kl": 3.375, "learning_rate": 1.7848555142656182e-05, "loss": 0.011, "num_tokens": 24571047.0, "reward": -1.3546142578125, "reward_std": 1.3857362270355225, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.217041015625, "rewards/ppl_reward/std": 4.560903072357178, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.4593359731952482, "grad_norm": 1.4256489276885986, "kl": 2.87109375, "learning_rate": 1.7843276476614296e-05, "loss": 0.0251, "num_tokens": 24588871.0, "reward": -0.7518310546875, "reward_std": 0.6550639867782593, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.214599609375, "rewards/ppl_reward/std": 3.2105374336242676, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 126.921875, "completions/mean_terminated_length": 126.921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.4605543710021323, "grad_norm": 1.376035213470459, "kl": 2.5986328125, "learning_rate": 1.78379921253886e-05, "loss": 0.0364, "num_tokens": 24603266.0, "reward": -1.55450439453125, "reward_std": 0.8211263418197632, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.8043212890625, "rewards/ppl_reward/std": 4.9665751457214355, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 140.84375, "completions/mean_terminated_length": 140.84375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.461772768809016, "grad_norm": 1.4334379434585571, "kl": 1.80859375, "learning_rate": 1.783270209280944e-05, "loss": 0.0173, "num_tokens": 24619200.0, "reward": -0.799560546875, "reward_std": 0.5209977626800537, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.34130859375, "rewards/ppl_reward/std": 2.599001169204712, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 138.765625, "completions/mean_terminated_length": 138.765625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.4629911666159001, "grad_norm": 1.665827989578247, "kl": 2.0380859375, "learning_rate": 1.782740638271128e-05, "loss": 0.0087, "num_tokens": 24634905.0, "reward": -1.729248046875, "reward_std": 0.689362645149231, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.22412109375, "rewards/ppl_reward/std": 3.656710386276245, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.464209564422784, "grad_norm": 1.5035350322723389, "kl": 2.416015625, "learning_rate": 1.7822104998932715e-05, "loss": -0.0067, "num_tokens": 24651425.0, "reward": -1.04376220703125, "reward_std": 1.4777660369873047, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.7359619140625, "rewards/ppl_reward/std": 5.025398254394531, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 108.71875, "completions/mean_terminated_length": 108.71875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.465427962229668, "grad_norm": 1.8015666007995605, "kl": 3.0625, "learning_rate": 1.7816797945316434e-05, "loss": 0.0496, "num_tokens": 24664495.0, "reward": -1.0489501953125, "reward_std": 0.9743242263793945, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.840087890625, "rewards/ppl_reward/std": 3.04058575630188, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 133.28125, "completions/mean_terminated_length": 133.28125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.466646360036552, "grad_norm": 2.8680808544158936, "kl": 2.5673828125, "learning_rate": 1.7811485225709255e-05, "loss": 0.0548, "num_tokens": 24680081.0, "reward": -1.3232421875, "reward_std": 0.7001288533210754, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.380859375, "rewards/ppl_reward/std": 4.928562641143799, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 125.21875, "completions/mean_terminated_length": 125.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.4678647578434358, "grad_norm": 2.34061336517334, "kl": 2.3798828125, "learning_rate": 1.7806166843962086e-05, "loss": 0.0681, "num_tokens": 24694823.0, "reward": -0.8953857421875, "reward_std": 0.37766218185424805, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.618896484375, "rewards/ppl_reward/std": 3.0553367137908936, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 130.234375, "completions/mean_terminated_length": 130.234375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.4690831556503199, "grad_norm": 3.660223960876465, "kl": 2.125, "learning_rate": 1.7800842803929947e-05, "loss": -0.0185, "num_tokens": 24710854.0, "reward": -0.2213134765625, "reward_std": 0.6258684396743774, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.145751953125, "rewards/ppl_reward/std": 1.7450777292251587, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 133.53125, "completions/mean_terminated_length": 133.53125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.4703015534572037, "grad_norm": 2.14733624458313, "kl": 3.58203125, "learning_rate": 1.7795513109471952e-05, "loss": 0.143, "num_tokens": 24726288.0, "reward": -1.34051513671875, "reward_std": 0.6095497012138367, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.3529052734375, "rewards/ppl_reward/std": 2.730161428451538, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 135.765625, "completions/mean_terminated_length": 135.765625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.4715199512640877, "grad_norm": 5.312027931213379, "kl": 8.671875, "learning_rate": 1.7790177764451333e-05, "loss": 0.3698, "num_tokens": 24742233.0, "reward": -2.010986328125, "reward_std": 1.5115633010864258, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.46728515625, "rewards/ppl_reward/std": 5.161105155944824, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.25898414850234985, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 125.046875, "completions/mean_terminated_length": 125.046875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.4727383490709718, "grad_norm": 9.85858154296875, "kl": 9.42578125, "learning_rate": 1.7784836772735393e-05, "loss": 0.2209, "num_tokens": 24757036.0, "reward": -4.321533203125, "reward_std": 4.848263263702393, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -11.84619140625, "rewards/ppl_reward/std": 17.862302780151367, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.25185325741767883, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 124.171875, "completions/mean_terminated_length": 124.171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.4739567468778556, "grad_norm": 4.911214828491211, "kl": 8.4140625, "learning_rate": 1.7779490138195544e-05, "loss": 0.3611, "num_tokens": 24771799.0, "reward": -1.802490234375, "reward_std": 1.5754311084747314, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.03466796875, "rewards/ppl_reward/std": 4.907349109649658, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2229611724615097, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 137.84375, "completions/mean_terminated_length": 137.84375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.4751751446847396, "grad_norm": 2.8679370880126953, "kl": 6.212890625, "learning_rate": 1.7774137864707283e-05, "loss": 0.3045, "num_tokens": 24789197.0, "reward": -1.0963134765625, "reward_std": 1.2404361963272095, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.723876953125, "rewards/ppl_reward/std": 5.009360313415527, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.4763935424916235, "grad_norm": 4.0061140060424805, "kl": 8.734375, "learning_rate": 1.7768779956150196e-05, "loss": 0.3196, "num_tokens": 24803637.0, "reward": -4.2142333984375, "reward_std": 1.5130248069763184, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -11.639404296875, "rewards/ppl_reward/std": 16.950393676757812, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.21789801120758057, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 114.078125, "completions/mean_terminated_length": 114.078125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.4776119402985075, "grad_norm": 2.7971110343933105, "kl": 6.1875, "learning_rate": 1.7763416416407953e-05, "loss": 0.2136, "num_tokens": 24817370.0, "reward": -2.064453125, "reward_std": 1.955367088317871, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.59765625, "rewards/ppl_reward/std": 7.191704750061035, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 131.140625, "completions/mean_terminated_length": 131.140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.4788303381053913, "grad_norm": 1.8505464792251587, "kl": 5.390625, "learning_rate": 1.7758047249368306e-05, "loss": 0.1407, "num_tokens": 24832763.0, "reward": -2.2987060546875, "reward_std": 1.9428081512451172, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.034912109375, "rewards/ppl_reward/std": 6.90710973739624, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 127.671875, "completions/mean_terminated_length": 127.671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.4800487359122754, "grad_norm": 4.303714275360107, "kl": 3.69140625, "learning_rate": 1.7752672458923078e-05, "loss": 0.161, "num_tokens": 24848254.0, "reward": -0.7835693359375, "reward_std": 0.9790339469909668, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.199951171875, "rewards/ppl_reward/std": 2.6387720108032227, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 108.15625, "completions/mean_terminated_length": 108.15625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.4812671337191592, "grad_norm": 2.2136809825897217, "kl": 2.5859375, "learning_rate": 1.7747292048968188e-05, "loss": 0.0655, "num_tokens": 24862152.0, "reward": -2.0234375, "reward_std": 2.608311653137207, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.75, "rewards/ppl_reward/std": 9.568670272827148, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 120.046875, "completions/mean_terminated_length": 120.046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.4824855315260432, "grad_norm": 1.8018066883087158, "kl": 3.296875, "learning_rate": 1.774190602340361e-05, "loss": 0.1451, "num_tokens": 24876931.0, "reward": -1.9742431640625, "reward_std": 0.5983870625495911, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.651611328125, "rewards/ppl_reward/std": 4.648141860961914, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 112.140625, "completions/mean_terminated_length": 112.140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.4837039293329273, "grad_norm": 1.6159971952438354, "kl": 1.2236328125, "learning_rate": 1.7736514386133386e-05, "loss": 0.0416, "num_tokens": 24891532.0, "reward": -0.3046875, "reward_std": 0.12660673260688782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.609375, "rewards/ppl_reward/std": 1.847951054573059, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.484922327139811, "grad_norm": 2.010835886001587, "kl": 2.302734375, "learning_rate": 1.7731117141065643e-05, "loss": 0.0085, "num_tokens": 24907104.0, "reward": -1.3472900390625, "reward_std": 0.7791980504989624, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.374267578125, "rewards/ppl_reward/std": 3.5529065132141113, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 117.89473724365234, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.4861407249466951, "grad_norm": 1.6152691841125488, "kl": 2.791015625, "learning_rate": 1.7725714292112567e-05, "loss": 0.284, "num_tokens": 24928736.0, "reward": -0.5625, "reward_std": 0.8927847743034363, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -4.390625, "rewards/ppl_reward/std": 2.091918706893921, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2123131901025772, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 100.59375, "completions/mean_terminated_length": 100.59375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.487359122753579, "grad_norm": 2.890115976333618, "kl": 3.8662109375, "learning_rate": 1.7720305843190393e-05, "loss": 0.1394, "num_tokens": 24941150.0, "reward": -1.67919921875, "reward_std": 0.765838623046875, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.0380859375, "rewards/ppl_reward/std": 5.244996070861816, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 121.609375, "completions/mean_terminated_length": 121.609375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.488577520560463, "grad_norm": 3.140943765640259, "kl": 6.1484375, "learning_rate": 1.7714891798219432e-05, "loss": 0.2612, "num_tokens": 24956469.0, "reward": -1.2384033203125, "reward_std": 1.1445451974868774, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.086181640625, "rewards/ppl_reward/std": 5.177441120147705, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 136.28125, "completions/mean_terminated_length": 122.19048309326172, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.489795918367347, "grad_norm": 2.1601860523223877, "kl": 2.107421875, "learning_rate": 1.770947216112404e-05, "loss": 0.2328, "num_tokens": 24972511.0, "reward": -0.141845703125, "reward_std": 0.2858581244945526, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.07275390625, "rewards/ppl_reward/std": 0.7879289388656616, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 175.96875, "completions/mean_terminated_length": 119.43334197998047, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.4910143161742309, "grad_norm": 4.825282573699951, "kl": 8.00244140625, "learning_rate": 1.770404693583263e-05, "loss": 0.4873, "num_tokens": 24990821.0, "reward": -1.109375, "reward_std": 0.7641588449478149, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.5390625, "rewards/ppl_reward/std": 3.2861735820770264, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1909000724554062, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.492232713981115, "grad_norm": 2.5151045322418213, "kl": 6.34375, "learning_rate": 1.769861612627767e-05, "loss": 0.2891, "num_tokens": 25005069.0, "reward": -2.1793212890625, "reward_std": 2.117349624633789, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.975830078125, "rewards/ppl_reward/std": 8.30933952331543, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 232.1875, "completions/mean_terminated_length": 119.0714340209961, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.4934511117879987, "grad_norm": 5.565193176269531, "kl": 7.1328125, "learning_rate": 1.769317973639567e-05, "loss": 0.2469, "num_tokens": 25026857.0, "reward": -1.4307861328125, "reward_std": 0.8852856159210205, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -6.033447265625, "rewards/ppl_reward/std": 4.620990753173828, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.21807578206062317, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 123.234375, "completions/mean_terminated_length": 108.93651580810547, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.4946695095948828, "grad_norm": 5.307079792022705, "kl": 8.15625, "learning_rate": 1.768773777012719e-05, "loss": 0.3709, "num_tokens": 25041232.0, "reward": -2.68994140625, "reward_std": 2.760920524597168, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.8720703125, "rewards/ppl_reward/std": 14.123530387878418, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19444002211093903, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 129.609375, "completions/mean_terminated_length": 129.609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.4958879074017668, "grad_norm": 5.357006072998047, "kl": 11.703125, "learning_rate": 1.768229023141682e-05, "loss": 0.5076, "num_tokens": 25056319.0, "reward": -4.83935546875, "reward_std": 2.3091769218444824, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -13.0458984375, "rewards/ppl_reward/std": 10.712199211120605, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19444002211093903, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 126.265625, "completions/mean_terminated_length": 112.01587677001953, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.4971063052086506, "grad_norm": 2.7041876316070557, "kl": 6.21875, "learning_rate": 1.7676837124213202e-05, "loss": 0.2882, "num_tokens": 25070920.0, "reward": -0.6258544921875, "reward_std": 1.0498936176300049, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -4.697021484375, "rewards/ppl_reward/std": 2.008450746536255, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1763816624879837, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 149.484375, "completions/mean_terminated_length": 121.2741928100586, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.4983247030155344, "grad_norm": 2.401214361190796, "kl": 6.302734375, "learning_rate": 1.7671378452469013e-05, "loss": 0.4032, "num_tokens": 25086871.0, "reward": -1.2890625, "reward_std": 1.1461282968521118, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.1015625, "rewards/ppl_reward/std": 3.9198575019836426, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 119.421875, "completions/mean_terminated_length": 119.421875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.4995431008224185, "grad_norm": 7.6617021560668945, "kl": 8.2734375, "learning_rate": 1.7665914220140964e-05, "loss": 0.3702, "num_tokens": 25101050.0, "reward": -2.5562744140625, "reward_std": 1.478743314743042, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.471923828125, "rewards/ppl_reward/std": 3.8839375972747803, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.25185325741767883, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 112.78125, "completions/mean_terminated_length": 112.78125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.5007614986293025, "grad_norm": 4.115344524383545, "kl": 5.671875, "learning_rate": 1.766044443118978e-05, "loss": 0.2495, "num_tokens": 25114900.0, "reward": -2.5172119140625, "reward_std": 1.277557134628296, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.612548828125, "rewards/ppl_reward/std": 6.325276851654053, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 156.671875, "completions/mean_terminated_length": 156.671875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.5019798964361866, "grad_norm": 2.4766008853912354, "kl": 2.796875, "learning_rate": 1.7654969089580244e-05, "loss": 0.1337, "num_tokens": 25132127.0, "reward": -1.289306640625, "reward_std": 0.6315191984176636, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.38330078125, "rewards/ppl_reward/std": 2.9857659339904785, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 111.28125, "completions/mean_terminated_length": 111.28125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.5031982942430704, "grad_norm": 3.1151726245880127, "kl": 3.576171875, "learning_rate": 1.764948819928113e-05, "loss": 0.0992, "num_tokens": 25145897.0, "reward": -0.4007568359375, "reward_std": 0.7305126786231995, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.379638671875, "rewards/ppl_reward/std": 1.3436943292617798, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 125.765625, "completions/mean_terminated_length": 125.765625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.5044166920499542, "grad_norm": 4.628190517425537, "kl": 3.947265625, "learning_rate": 1.7644001764265264e-05, "loss": 0.1131, "num_tokens": 25160778.0, "reward": -1.5260009765625, "reward_std": 0.6413519382476807, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.637939453125, "rewards/ppl_reward/std": 4.909880638122559, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 118.90625, "completions/mean_terminated_length": 118.90625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.5056350898568382, "grad_norm": 1.8265304565429688, "kl": 3.388671875, "learning_rate": 1.763850978850947e-05, "loss": 0.2002, "num_tokens": 25174972.0, "reward": -1.623291015625, "reward_std": 0.7340237498283386, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.01220703125, "rewards/ppl_reward/std": 3.8304264545440674, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.5068534876637223, "grad_norm": 1.920392632484436, "kl": 2.876953125, "learning_rate": 1.76330122759946e-05, "loss": 0.0586, "num_tokens": 25189878.0, "reward": -3.4066162109375, "reward_std": 2.0592844486236572, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.500732421875, "rewards/ppl_reward/std": 9.789756774902344, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 125.546875, "completions/mean_terminated_length": 125.546875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.508071885470606, "grad_norm": 2.026665449142456, "kl": 2.96875, "learning_rate": 1.762750923070551e-05, "loss": 0.1013, "num_tokens": 25205441.0, "reward": -1.77783203125, "reward_std": 1.0663410425186157, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.3603515625, "rewards/ppl_reward/std": 6.59158992767334, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 159.28125, "completions/mean_terminated_length": 145.55555725097656, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.5092902832774902, "grad_norm": 1.5625067949295044, "kl": 4.921875, "learning_rate": 1.7622000656631083e-05, "loss": 0.2635, "num_tokens": 25223691.0, "reward": -0.585205078125, "reward_std": 0.5032727122306824, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.76416015625, "rewards/ppl_reward/std": 1.8959451913833618, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 115.09375, "completions/mean_terminated_length": 115.09375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.510508681084374, "grad_norm": 3.450157642364502, "kl": 4.9033203125, "learning_rate": 1.7616486557764187e-05, "loss": 0.1948, "num_tokens": 25237153.0, "reward": -1.63720703125, "reward_std": 1.501244306564331, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.0556640625, "rewards/ppl_reward/std": 10.619990348815918, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 117.546875, "completions/mean_terminated_length": 117.546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.511727078891258, "grad_norm": 1.6925697326660156, "kl": 2.2666015625, "learning_rate": 1.7610966938101713e-05, "loss": 0.1242, "num_tokens": 25251356.0, "reward": -0.451171875, "reward_std": 0.28816133737564087, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.78515625, "rewards/ppl_reward/std": 2.1350173950195312, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 112.015625, "completions/mean_terminated_length": 112.015625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.512945476698142, "grad_norm": 2.725729465484619, "kl": 4.2294921875, "learning_rate": 1.760544180164454e-05, "loss": 0.1683, "num_tokens": 25265077.0, "reward": -0.9229736328125, "reward_std": 0.9020363092422485, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.595947265625, "rewards/ppl_reward/std": 4.046694755554199, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 124.109375, "completions/mean_terminated_length": 124.109375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.5141638745050259, "grad_norm": 2.912515878677368, "kl": 8.1171875, "learning_rate": 1.759991115239756e-05, "loss": 0.332, "num_tokens": 25280012.0, "reward": -2.0, "reward_std": 0.9420903921127319, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.5859375, "rewards/ppl_reward/std": 5.672311305999756, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.5153822723119097, "grad_norm": 3.488030195236206, "kl": 7.8984375, "learning_rate": 1.7594374994369644e-05, "loss": 0.3086, "num_tokens": 25295980.0, "reward": -2.44293212890625, "reward_std": 1.7064332962036133, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.4014892578125, "rewards/ppl_reward/std": 7.856359481811523, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 142.765625, "completions/mean_terminated_length": 128.7777862548828, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.5166006701187937, "grad_norm": 3.294384002685547, "kl": 4.767578125, "learning_rate": 1.7588833331573677e-05, "loss": 0.3667, "num_tokens": 25312549.0, "reward": -1.175537109375, "reward_std": 0.8305699825286865, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.10107421875, "rewards/ppl_reward/std": 2.214508533477783, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.5178190679256778, "grad_norm": 2.4609122276306152, "kl": 7.0625, "learning_rate": 1.758328616802651e-05, "loss": 0.2758, "num_tokens": 25328429.0, "reward": -0.8289794921875, "reward_std": 1.5282353162765503, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.040771484375, "rewards/ppl_reward/std": 3.6840078830718994, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2348787635564804, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.5190374657325618, "grad_norm": 1.7091758251190186, "kl": 5.0419921875, "learning_rate": 1.7577733507749007e-05, "loss": 0.2429, "num_tokens": 25344857.0, "reward": -0.2711181640625, "reward_std": 0.4641294479370117, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.339111328125, "rewards/ppl_reward/std": 1.6331442594528198, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.5202558635394456, "grad_norm": 2.217139720916748, "kl": 6.6171875, "learning_rate": 1.7572175354765993e-05, "loss": 0.2869, "num_tokens": 25360869.0, "reward": -1.209228515625, "reward_std": 0.8507615923881531, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.76220703125, "rewards/ppl_reward/std": 4.269320011138916, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20167945325374603, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 132.96774291992188, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.5214742613463295, "grad_norm": 1.8901673555374146, "kl": 5.80859375, "learning_rate": 1.7566611713106287e-05, "loss": 0.3758, "num_tokens": 25378249.0, "reward": -1.88671875, "reward_std": 1.011513352394104, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.3203125, "rewards/ppl_reward/std": 5.186132907867432, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 144.765625, "completions/mean_terminated_length": 130.8095245361328, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.5226926591532135, "grad_norm": 3.5245752334594727, "kl": 5.32421875, "learning_rate": 1.756104258680269e-05, "loss": 0.3281, "num_tokens": 25394474.0, "reward": -1.8585205078125, "reward_std": 1.448853611946106, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.084228515625, "rewards/ppl_reward/std": 8.244622230529785, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 148.09375, "completions/mean_terminated_length": 134.19049072265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.5239110569600975, "grad_norm": 1.8885499238967896, "kl": 4.8203125, "learning_rate": 1.755546797989196e-05, "loss": 0.2379, "num_tokens": 25410664.0, "reward": -3.3260498046875, "reward_std": 1.1745309829711914, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -10.027099609375, "rewards/ppl_reward/std": 6.985666751861572, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 131.93650817871094, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.5251294547669816, "grad_norm": 1.2712628841400146, "kl": 2.78515625, "learning_rate": 1.7549887896414853e-05, "loss": 0.1636, "num_tokens": 25427456.0, "reward": -2.22509765625, "reward_std": 0.6841436624526978, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.0751953125, "rewards/ppl_reward/std": 7.012479782104492, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.5263478525738654, "grad_norm": 3.6110355854034424, "kl": 5.70703125, "learning_rate": 1.7544302340416073e-05, "loss": 0.2728, "num_tokens": 25442076.0, "reward": -1.5828857421875, "reward_std": 1.0679720640182495, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.517333984375, "rewards/ppl_reward/std": 4.452244758605957, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1802070438861847, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 106.890625, "completions/mean_terminated_length": 106.890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.5275662503807492, "grad_norm": 1.7617990970611572, "kl": 3.28125, "learning_rate": 1.7538711315944304e-05, "loss": 0.029, "num_tokens": 25454941.0, "reward": -1.705322265625, "reward_std": 0.7708228826522827, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.96533203125, "rewards/ppl_reward/std": 3.096898078918457, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 162.96875, "completions/mean_terminated_length": 135.19354248046875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.5287846481876333, "grad_norm": 2.096646308898926, "kl": 1.59765625, "learning_rate": 1.7533114827052187e-05, "loss": 0.1013, "num_tokens": 25472627.0, "reward": -1.401123046875, "reward_std": 0.40701690316200256, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.53662109375, "rewards/ppl_reward/std": 2.0205345153808594, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 132.21875, "completions/mean_terminated_length": 132.21875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.5300030459945173, "grad_norm": 2.4069793224334717, "kl": 3.322265625, "learning_rate": 1.7527512877796327e-05, "loss": 0.0852, "num_tokens": 25488337.0, "reward": -1.424072265625, "reward_std": 0.577853798866272, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.48876953125, "rewards/ppl_reward/std": 3.890200614929199, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10175786912441254, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 124.21875, "completions/mean_terminated_length": 124.21875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.5312214438014011, "grad_norm": 2.0629501342773438, "kl": 2.796875, "learning_rate": 1.752190547223729e-05, "loss": 0.0796, "num_tokens": 25503559.0, "reward": -1.33251953125, "reward_std": 1.5954936742782593, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.2822265625, "rewards/ppl_reward/std": 5.366974830627441, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 118.84375, "completions/mean_terminated_length": 118.84375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.532439841608285, "grad_norm": 2.444829225540161, "kl": 5.91015625, "learning_rate": 1.7516292614439586e-05, "loss": 0.1555, "num_tokens": 25517517.0, "reward": -1.48681640625, "reward_std": 0.9705718755722046, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.3798828125, "rewards/ppl_reward/std": 4.55509614944458, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.533658239415169, "grad_norm": 2.1661322116851807, "kl": 3.263671875, "learning_rate": 1.7510674308471687e-05, "loss": 0.1017, "num_tokens": 25534045.0, "reward": -0.60736083984375, "reward_std": 0.6375002861022949, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.9100341796875, "rewards/ppl_reward/std": 3.8074851036071777, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 172.953125, "completions/mean_terminated_length": 131.09835815429688, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.534876637222053, "grad_norm": 1.6090022325515747, "kl": 3.30224609375, "learning_rate": 1.7505050558406012e-05, "loss": 0.2889, "num_tokens": 25552298.0, "reward": -5.333984375, "reward_std": 5.245047569274902, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -14.16796875, "rewards/ppl_reward/std": 32.26267623901367, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 138.109375, "completions/mean_terminated_length": 124.04762268066406, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.536095035028937, "grad_norm": 1.708755612373352, "kl": 2.0068359375, "learning_rate": 1.7499421368318926e-05, "loss": 0.2277, "num_tokens": 25568065.0, "reward": -1.939453125, "reward_std": 1.294981837272644, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.58203125, "rewards/ppl_reward/std": 7.528308868408203, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 122.79365539550781, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.537313432835821, "grad_norm": 1.835400938987732, "kl": 2.986328125, "learning_rate": 1.7493786742290734e-05, "loss": 0.1703, "num_tokens": 25583753.0, "reward": -0.017822265625, "reward_std": 0.5248695611953735, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.73095703125, "rewards/ppl_reward/std": 0.8754581809043884, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 126.09524536132812, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.5385318306427047, "grad_norm": 1.6984745264053345, "kl": 3.4208984375, "learning_rate": 1.7488146684405686e-05, "loss": 0.3639, "num_tokens": 25599737.0, "reward": -0.5709228515625, "reward_std": 0.5381546020507812, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.751220703125, "rewards/ppl_reward/std": 2.062163829803467, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 127.234375, "completions/mean_terminated_length": 127.234375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.5397502284495888, "grad_norm": 3.642082691192627, "kl": 8.42578125, "learning_rate": 1.7482501198751968e-05, "loss": 0.3975, "num_tokens": 25614752.0, "reward": -2.791259765625, "reward_std": 2.1203792095184326, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.25439453125, "rewards/ppl_reward/std": 9.15778923034668, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 135.1746063232422, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.5409686262564728, "grad_norm": 1.5230472087860107, "kl": 1.9580078125, "learning_rate": 1.7476850289421697e-05, "loss": 0.1781, "num_tokens": 25630756.0, "reward": -2.3221435546875, "reward_std": 0.7620047330856323, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.464599609375, "rewards/ppl_reward/std": 3.909147024154663, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 121.53125, "completions/mean_terminated_length": 121.53125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.5421870240633568, "grad_norm": 3.5802087783813477, "kl": 5.96484375, "learning_rate": 1.7471193960510924e-05, "loss": 0.1772, "num_tokens": 25645598.0, "reward": -2.3006591796875, "reward_std": 0.6932467222213745, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.148193359375, "rewards/ppl_reward/std": 8.39067554473877, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 135.84375, "completions/mean_terminated_length": 135.84375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.5434054218702407, "grad_norm": 3.0056536197662354, "kl": 6.5859375, "learning_rate": 1.7465532216119628e-05, "loss": 0.2536, "num_tokens": 25661868.0, "reward": -9.264892578125, "reward_std": 1.9500935077667236, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -22.21728515625, "rewards/ppl_reward/std": 44.3135871887207, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 135.015625, "completions/mean_terminated_length": 135.015625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.5446238196771245, "grad_norm": 1.9961495399475098, "kl": 3.337890625, "learning_rate": 1.7459865060351714e-05, "loss": 0.0896, "num_tokens": 25677533.0, "reward": -2.475341796875, "reward_std": 0.6929559707641602, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.73974609375, "rewards/ppl_reward/std": 5.976204872131348, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17567719519138336, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 134.53125, "completions/mean_terminated_length": 134.53125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.5458422174840085, "grad_norm": 2.4256062507629395, "kl": 8.6796875, "learning_rate": 1.745419249731501e-05, "loss": 0.372, "num_tokens": 25693335.0, "reward": -0.626708984375, "reward_std": 0.9252616763114929, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.65966796875, "rewards/ppl_reward/std": 1.689084768295288, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2514837086200714, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 148.328125, "completions/mean_terminated_length": 134.42857360839844, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.5470606152908926, "grad_norm": 2.1969542503356934, "kl": 4.3388671875, "learning_rate": 1.7448514531121265e-05, "loss": 0.3086, "num_tokens": 25709716.0, "reward": -1.3624267578125, "reward_std": 0.6036715507507324, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.537353515625, "rewards/ppl_reward/std": 3.331547975540161, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 160.140625, "completions/mean_terminated_length": 146.42857360839844, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.5482790130977764, "grad_norm": 3.9473345279693604, "kl": 7.701171875, "learning_rate": 1.7442831165886134e-05, "loss": 0.3156, "num_tokens": 25727453.0, "reward": -0.9453125, "reward_std": 0.5840765237808228, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.4921875, "rewards/ppl_reward/std": 2.8682701587677, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2076999396085739, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 140.671875, "completions/mean_terminated_length": 140.671875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.5494974109046604, "grad_norm": 1.9872393608093262, "kl": 4.810546875, "learning_rate": 1.7437142405729196e-05, "loss": 0.2442, "num_tokens": 25743216.0, "reward": -2.305419921875, "reward_std": 0.5547690391540527, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.27490234375, "rewards/ppl_reward/std": 9.209370613098145, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 151.03125, "completions/mean_terminated_length": 151.03125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.5507158087115442, "grad_norm": 2.755401134490967, "kl": 2.841796875, "learning_rate": 1.7431448254773943e-05, "loss": 0.1869, "num_tokens": 25760554.0, "reward": -0.3724365234375, "reward_std": 0.32629281282424927, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.572998046875, "rewards/ppl_reward/std": 2.037606716156006, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 139.703125, "completions/mean_terminated_length": 139.703125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.5519342065184283, "grad_norm": 2.0273149013519287, "kl": 4.36328125, "learning_rate": 1.7425748717147766e-05, "loss": 0.1223, "num_tokens": 25776783.0, "reward": -0.438232421875, "reward_std": 0.39738667011260986, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.59521484375, "rewards/ppl_reward/std": 2.2130239009857178, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 147.09375, "completions/mean_terminated_length": 147.09375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.5531526043253123, "grad_norm": 2.5969977378845215, "kl": 3.77734375, "learning_rate": 1.7420043796981964e-05, "loss": 0.2023, "num_tokens": 25793397.0, "reward": -0.18017578125, "reward_std": 0.4450739622116089, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.1962890625, "rewards/ppl_reward/std": 2.215622901916504, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 131.4375, "completions/mean_terminated_length": 131.4375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.5543710021321961, "grad_norm": 1.5502384901046753, "kl": 3.0791015625, "learning_rate": 1.7414333498411734e-05, "loss": 0.0829, "num_tokens": 25808505.0, "reward": -0.46453857421875, "reward_std": 0.47615116834640503, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.7884521484375, "rewards/ppl_reward/std": 2.5594112873077393, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 143.96875, "completions/mean_terminated_length": 143.96875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.55558939993908, "grad_norm": 1.7113749980926514, "kl": 4.0400390625, "learning_rate": 1.740861782557618e-05, "loss": 0.181, "num_tokens": 25825439.0, "reward": -1.1806640625, "reward_std": 0.38986068964004517, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.205078125, "rewards/ppl_reward/std": 3.9418442249298096, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.556807797745964, "grad_norm": 1.6825942993164062, "kl": 3.55078125, "learning_rate": 1.7402896782618292e-05, "loss": 0.1012, "num_tokens": 25840179.0, "reward": -0.8065185546875, "reward_std": 0.7464643716812134, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.355224609375, "rewards/ppl_reward/std": 3.2224600315093994, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 132.640625, "completions/mean_terminated_length": 132.640625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.558026195552848, "grad_norm": 1.9690799713134766, "kl": 6.5546875, "learning_rate": 1.739717037368496e-05, "loss": 0.2505, "num_tokens": 25855516.0, "reward": -1.48345947265625, "reward_std": 1.057776927947998, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.4825439453125, "rewards/ppl_reward/std": 3.5538275241851807, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 136.59375, "completions/mean_terminated_length": 136.59375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.559244593359732, "grad_norm": 3.3360209465026855, "kl": 4.24609375, "learning_rate": 1.739143860292696e-05, "loss": 0.0794, "num_tokens": 25870922.0, "reward": -0.4659423828125, "reward_std": 0.77923583984375, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.595947265625, "rewards/ppl_reward/std": 1.726641058921814, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.560462991166616, "grad_norm": 3.824669599533081, "kl": 4.54296875, "learning_rate": 1.7385701474498958e-05, "loss": 0.0956, "num_tokens": 25887318.0, "reward": -0.8336181640625, "reward_std": 0.7933820486068726, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.206298828125, "rewards/ppl_reward/std": 2.60775089263916, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23007801175117493, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 127.234375, "completions/mean_terminated_length": 127.234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.5616813889734997, "grad_norm": 2.2548575401306152, "kl": 5.4296875, "learning_rate": 1.7379958992559494e-05, "loss": 0.1397, "num_tokens": 25901949.0, "reward": -3.71307373046875, "reward_std": 2.3298182487487793, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -10.8792724609375, "rewards/ppl_reward/std": 13.159521102905273, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 130.8125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.5628997867803838, "grad_norm": 1.7732070684432983, "kl": 3.7294921875, "learning_rate": 1.7374211161271e-05, "loss": 0.1159, "num_tokens": 25917145.0, "reward": -4.5335693359375, "reward_std": 3.1496524810791016, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -12.723388671875, "rewards/ppl_reward/std": 18.51633071899414, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 129.390625, "completions/mean_terminated_length": 129.390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.5641181845872678, "grad_norm": 1.7870711088180542, "kl": 3.3271484375, "learning_rate": 1.7368457984799777e-05, "loss": 0.0634, "num_tokens": 25931994.0, "reward": -0.306640625, "reward_std": 0.6420353651046753, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.26953125, "rewards/ppl_reward/std": 2.0515167713165283, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 127.796875, "completions/mean_terminated_length": 127.796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.5653365823941519, "grad_norm": 9.320108413696289, "kl": 3.90234375, "learning_rate": 1.7362699467316005e-05, "loss": 0.0218, "num_tokens": 25947173.0, "reward": -1.179443359375, "reward_std": 0.7457500696182251, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.89794921875, "rewards/ppl_reward/std": 2.3714728355407715, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 134.046875, "completions/mean_terminated_length": 134.046875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.5665549802010357, "grad_norm": 2.7890288829803467, "kl": 4.1103515625, "learning_rate": 1.735693561299373e-05, "loss": 0.1457, "num_tokens": 25962264.0, "reward": -1.3682861328125, "reward_std": 1.072068452835083, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.517822265625, "rewards/ppl_reward/std": 4.7098541259765625, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 134.984375, "completions/mean_terminated_length": 134.984375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.5677733780079195, "grad_norm": 2.4242022037506104, "kl": 4.3984375, "learning_rate": 1.7351166426010885e-05, "loss": 0.2585, "num_tokens": 25978055.0, "reward": -1.0635986328125, "reward_std": 0.41189777851104736, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.861572265625, "rewards/ppl_reward/std": 2.670377016067505, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.5689917758148035, "grad_norm": 2.6519649028778076, "kl": 5.685546875, "learning_rate": 1.7345391910549238e-05, "loss": 0.1466, "num_tokens": 25994207.0, "reward": -2.390380859375, "reward_std": 1.9408591985702515, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.35888671875, "rewards/ppl_reward/std": 9.648268699645996, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 123.171875, "completions/mean_terminated_length": 123.171875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.5702101736216876, "grad_norm": 3.0245072841644287, "kl": 7.26171875, "learning_rate": 1.7339612070794447e-05, "loss": 0.3518, "num_tokens": 26008866.0, "reward": -1.5128173828125, "reward_std": 1.7351067066192627, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.416259765625, "rewards/ppl_reward/std": 4.62325382232666, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.5714285714285714, "grad_norm": 1.5430012941360474, "kl": 2.990234375, "learning_rate": 1.733382691093601e-05, "loss": -0.0226, "num_tokens": 26023830.0, "reward": -0.44085693359375, "reward_std": 1.0294550657272339, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.4285888671875, "rewards/ppl_reward/std": 3.601367950439453, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2605654299259186, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 134.046875, "completions/mean_terminated_length": 134.046875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.5726469692354552, "grad_norm": 2.8240535259246826, "kl": 4.158203125, "learning_rate": 1.732803643516729e-05, "loss": 0.1574, "num_tokens": 26040545.0, "reward": -3.8907470703125, "reward_std": 0.7588589191436768, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -11.500244140625, "rewards/ppl_reward/std": 12.086967468261719, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 134.546875, "completions/mean_terminated_length": 134.546875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.5738653670423393, "grad_norm": 1.9850127696990967, "kl": 6.546875, "learning_rate": 1.7322240647685503e-05, "loss": 0.274, "num_tokens": 26056084.0, "reward": -2.923583984375, "reward_std": 1.6422932147979736, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.33935546875, "rewards/ppl_reward/std": 9.79305648803711, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 128.640625, "completions/mean_terminated_length": 128.640625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.5750837648492233, "grad_norm": 2.979358196258545, "kl": 5.57421875, "learning_rate": 1.7316439552691714e-05, "loss": 0.2219, "num_tokens": 26071285.0, "reward": -1.5150146484375, "reward_std": 1.408776044845581, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.451904296875, "rewards/ppl_reward/std": 6.041647911071777, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 124.625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.5763021626561073, "grad_norm": 2.590101957321167, "kl": 5.69921875, "learning_rate": 1.731063315439084e-05, "loss": 0.1638, "num_tokens": 26086245.0, "reward": -2.1484375, "reward_std": 1.0004868507385254, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.6953125, "rewards/ppl_reward/std": 2.8133320808410645, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.23408547043800354, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 166.015625, "completions/mean_terminated_length": 166.015625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.5775205604629912, "grad_norm": 2.2734031677246094, "kl": 7.28125, "learning_rate": 1.7304821456991634e-05, "loss": 0.3304, "num_tokens": 26105758.0, "reward": -1.81787109375, "reward_std": 1.0849251747131348, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.8623046875, "rewards/ppl_reward/std": 2.9176816940307617, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23061636090278625, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 127.953125, "completions/mean_terminated_length": 127.953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.578738958269875, "grad_norm": 3.6762571334838867, "kl": 4.578125, "learning_rate": 1.7299004464706692e-05, "loss": 0.1852, "num_tokens": 26121251.0, "reward": -1.45880126953125, "reward_std": 1.1315110921859741, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.3707275390625, "rewards/ppl_reward/std": 4.454535961151123, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.244862899184227, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 124.796875, "completions/mean_terminated_length": 124.796875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.579957356076759, "grad_norm": 2.17661190032959, "kl": 3.388671875, "learning_rate": 1.729318218175245e-05, "loss": 0.0517, "num_tokens": 26135790.0, "reward": -1.3150634765625, "reward_std": 0.7001090049743652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.239501953125, "rewards/ppl_reward/std": 3.751856565475464, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 130.5625, "completions/mean_terminated_length": 130.5625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.581175753883643, "grad_norm": 2.7839813232421875, "kl": 3.84375, "learning_rate": 1.728735461234918e-05, "loss": 0.1897, "num_tokens": 26151314.0, "reward": -3.8260498046875, "reward_std": 1.6050232648849487, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.191162109375, "rewards/ppl_reward/std": 11.972259521484375, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 141.890625, "completions/mean_terminated_length": 141.890625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.582394151690527, "grad_norm": 2.8673996925354004, "kl": 3.3515625, "learning_rate": 1.728152176072099e-05, "loss": 0.1191, "num_tokens": 26168091.0, "reward": -1.373046875, "reward_std": 0.8989815711975098, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.38671875, "rewards/ppl_reward/std": 2.866225242614746, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 141.078125, "completions/mean_terminated_length": 141.078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.583612549497411, "grad_norm": 1.81771981716156, "kl": 2.669921875, "learning_rate": 1.72756836310958e-05, "loss": 0.1282, "num_tokens": 26184296.0, "reward": -0.91925048828125, "reward_std": 0.8746542930603027, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.5650634765625, "rewards/ppl_reward/std": 3.569465160369873, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 138.90625, "completions/mean_terminated_length": 138.90625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.5848309473042947, "grad_norm": 2.4113006591796875, "kl": 4.39453125, "learning_rate": 1.7269840227705375e-05, "loss": 0.1437, "num_tokens": 26200994.0, "reward": -1.8638916015625, "reward_std": 0.9132548570632935, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.243408203125, "rewards/ppl_reward/std": 5.153988838195801, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2366211861371994, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 131.546875, "completions/mean_terminated_length": 131.546875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.5860493451111788, "grad_norm": 2.0781781673431396, "kl": 3.91796875, "learning_rate": 1.726399155478529e-05, "loss": 0.0721, "num_tokens": 26216749.0, "reward": -1.736328125, "reward_std": 0.9969539642333984, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.02734375, "rewards/ppl_reward/std": 3.464653253555298, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 133.359375, "completions/mean_terminated_length": 133.359375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.5872677429180628, "grad_norm": 2.0130364894866943, "kl": 2.8515625, "learning_rate": 1.725813761657495e-05, "loss": 0.1325, "num_tokens": 26231740.0, "reward": -1.4193115234375, "reward_std": 0.4431605339050293, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.627685546875, "rewards/ppl_reward/std": 4.685102939605713, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 118.40625, "completions/mean_terminated_length": 118.40625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.5884861407249466, "grad_norm": 2.0803134441375732, "kl": 5.02734375, "learning_rate": 1.725227841731757e-05, "loss": 0.1645, "num_tokens": 26245494.0, "reward": -1.4688720703125, "reward_std": 1.6949126720428467, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.586181640625, "rewards/ppl_reward/std": 4.197292804718018, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 135.515625, "completions/mean_terminated_length": 135.515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.5897045385318307, "grad_norm": 3.715940237045288, "kl": 6.833984375, "learning_rate": 1.7246413961260165e-05, "loss": 0.3597, "num_tokens": 26261495.0, "reward": -0.21392822265625, "reward_std": 0.5961331129074097, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.1153564453125, "rewards/ppl_reward/std": 1.1592122316360474, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 122.21875, "completions/mean_terminated_length": 122.21875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.5909229363387145, "grad_norm": 3.347926139831543, "kl": 3.9736328125, "learning_rate": 1.7240544252653596e-05, "loss": 0.0759, "num_tokens": 26276165.0, "reward": -2.5631103515625, "reward_std": 0.8188092112541199, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.821533203125, "rewards/ppl_reward/std": 7.426369667053223, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.5921413341455986, "grad_norm": 4.829290390014648, "kl": 7.875, "learning_rate": 1.72346692957525e-05, "loss": 0.1541, "num_tokens": 26291301.0, "reward": -1.19677734375, "reward_std": 1.7133078575134277, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.7138671875, "rewards/ppl_reward/std": 3.9944934844970703, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.3083806037902832, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 122.03125, "completions/mean_terminated_length": 122.03125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.5933597319524826, "grad_norm": 2.54495906829834, "kl": 5.4072265625, "learning_rate": 1.7228789094815327e-05, "loss": 0.1868, "num_tokens": 26305423.0, "reward": -1.0689697265625, "reward_std": 0.9973824620246887, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.677001953125, "rewards/ppl_reward/std": 2.6400535106658936, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19760315120220184, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.5945781297593664, "grad_norm": 1.8784692287445068, "kl": 4.8984375, "learning_rate": 1.722290365410433e-05, "loss": 0.1807, "num_tokens": 26320295.0, "reward": -1.2989501953125, "reward_std": 0.49501585960388184, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.332275390625, "rewards/ppl_reward/std": 4.801675319671631, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 125.734375, "completions/mean_terminated_length": 125.734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.5957965275662502, "grad_norm": 2.055739641189575, "kl": 7.74609375, "learning_rate": 1.7217012977885556e-05, "loss": 0.3398, "num_tokens": 26335166.0, "reward": -1.4718017578125, "reward_std": 1.1626293659210205, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.435791015625, "rewards/ppl_reward/std": 2.922468662261963, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 155.828125, "completions/mean_terminated_length": 155.828125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.5970149253731343, "grad_norm": 1.9769963026046753, "kl": 7.4453125, "learning_rate": 1.721111707042886e-05, "loss": 0.2914, "num_tokens": 26352827.0, "reward": -2.6295166015625, "reward_std": 0.9969530701637268, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.634033203125, "rewards/ppl_reward/std": 6.018672466278076, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 115.9375, "completions/mean_terminated_length": 115.9375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.5982333231800183, "grad_norm": 1.5148428678512573, "kl": 4.43359375, "learning_rate": 1.720521593600787e-05, "loss": 0.0811, "num_tokens": 26366311.0, "reward": -2.53466796875, "reward_std": 2.395519733428955, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.4912109375, "rewards/ppl_reward/std": 8.396449089050293, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.25865477323532104, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 121.078125, "completions/mean_terminated_length": 121.078125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.5994517209869024, "grad_norm": 1.5688384771347046, "kl": 4.4921875, "learning_rate": 1.7199309578900017e-05, "loss": 0.117, "num_tokens": 26380436.0, "reward": -1.233154296875, "reward_std": 1.6379690170288086, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.11474609375, "rewards/ppl_reward/std": 5.08718729019165, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.6006701187937862, "grad_norm": 1.539827585220337, "kl": 4.23046875, "learning_rate": 1.7193398003386514e-05, "loss": -0.0131, "num_tokens": 26396604.0, "reward": -1.326904296875, "reward_std": 1.645730972290039, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.03662109375, "rewards/ppl_reward/std": 3.8738341331481934, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2627868354320526, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 145.40625, "completions/mean_terminated_length": 145.40625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.60188851660067, "grad_norm": 5.650953769683838, "kl": 5.45703125, "learning_rate": 1.7187481213752348e-05, "loss": 0.2214, "num_tokens": 26413502.0, "reward": -1.0548095703125, "reward_std": 1.0293571949005127, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.734619140625, "rewards/ppl_reward/std": 2.6097657680511475, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2592533528804779, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 136.8125, "completions/mean_terminated_length": 136.8125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.603106914407554, "grad_norm": 2.6675031185150146, "kl": 3.6484375, "learning_rate": 1.7181559214286298e-05, "loss": 0.1557, "num_tokens": 26429066.0, "reward": -3.6561279296875, "reward_std": 2.2563822269439697, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.937255859375, "rewards/ppl_reward/std": 10.455293655395508, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 136.46875, "completions/mean_terminated_length": 136.46875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.604325312214438, "grad_norm": 3.451993942260742, "kl": 4.47265625, "learning_rate": 1.7175632009280912e-05, "loss": 0.1712, "num_tokens": 26444872.0, "reward": -0.247314453125, "reward_std": 0.8516406416893005, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.06494140625, "rewards/ppl_reward/std": 1.9614262580871582, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 150.171875, "completions/mean_terminated_length": 150.171875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.6055437100213221, "grad_norm": 3.789762496948242, "kl": 3.1474609375, "learning_rate": 1.7169699603032516e-05, "loss": 0.2205, "num_tokens": 26461707.0, "reward": -0.431640625, "reward_std": 0.5735731720924377, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.51171875, "rewards/ppl_reward/std": 1.6673576831817627, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 135.640625, "completions/mean_terminated_length": 135.640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.606762107828206, "grad_norm": 4.749217510223389, "kl": 2.666015625, "learning_rate": 1.7163761999841194e-05, "loss": 0.0152, "num_tokens": 26477412.0, "reward": -8.9920654296875, "reward_std": 11.95151138305664, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -21.577880859375, "rewards/ppl_reward/std": 64.44505310058594, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23091863095760345, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 131.671875, "completions/mean_terminated_length": 131.671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.6079805056350898, "grad_norm": 2.203186273574829, "kl": 4.12109375, "learning_rate": 1.715781920401082e-05, "loss": 0.0813, "num_tokens": 26492855.0, "reward": -2.9327392578125, "reward_std": 2.804314136505127, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.295166015625, "rewards/ppl_reward/std": 8.259450912475586, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.27433067560195923, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 134.65625, "completions/mean_terminated_length": 134.65625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.6091989034419738, "grad_norm": 2.1208014488220215, "kl": 5.78125, "learning_rate": 1.715187121984901e-05, "loss": 0.2468, "num_tokens": 26508393.0, "reward": -1.0361328125, "reward_std": 1.353055477142334, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.416015625, "rewards/ppl_reward/std": 3.0954837799072266, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.27094778418540955, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 130.078125, "completions/mean_terminated_length": 130.078125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.6104173012488578, "grad_norm": 2.6515049934387207, "kl": 6.3671875, "learning_rate": 1.7145918051667147e-05, "loss": 0.2087, "num_tokens": 26524006.0, "reward": -0.97705078125, "reward_std": 0.8123164176940918, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.3291015625, "rewards/ppl_reward/std": 2.329040765762329, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 126.890625, "completions/mean_terminated_length": 126.890625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.6116356990557417, "grad_norm": 4.5203022956848145, "kl": 5.998046875, "learning_rate": 1.7139959703780377e-05, "loss": 0.27, "num_tokens": 26539295.0, "reward": -1.5008544921875, "reward_std": 1.7443208694458008, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.572021484375, "rewards/ppl_reward/std": 7.937292575836182, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 112.625, "completions/mean_terminated_length": 112.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.6128540968626255, "grad_norm": 4.68159294128418, "kl": 5.55078125, "learning_rate": 1.7133996180507598e-05, "loss": 0.1649, "num_tokens": 26553239.0, "reward": -3.830322265625, "reward_std": 2.4823460578918457, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -11.09814453125, "rewards/ppl_reward/std": 15.295077323913574, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2578144073486328, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 113.578125, "completions/mean_terminated_length": 113.578125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.6140724946695095, "grad_norm": 3.676283836364746, "kl": 3.0859375, "learning_rate": 1.7128027486171457e-05, "loss": 0.0958, "num_tokens": 26567324.0, "reward": -1.6932373046875, "reward_std": 0.703091025352478, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.081787109375, "rewards/ppl_reward/std": 4.574948310852051, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 116.375, "completions/mean_terminated_length": 116.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.6152908924763936, "grad_norm": 2.1596016883850098, "kl": 4.4560546875, "learning_rate": 1.712205362509835e-05, "loss": 0.1787, "num_tokens": 26581748.0, "reward": -1.5296630859375, "reward_std": 1.1806331872940063, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.770263671875, "rewards/ppl_reward/std": 5.186867713928223, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 111.640625, "completions/mean_terminated_length": 111.640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.6165092902832776, "grad_norm": 2.161146879196167, "kl": 3.7421875, "learning_rate": 1.7116074601618418e-05, "loss": 0.0378, "num_tokens": 26595653.0, "reward": -1.8109130859375, "reward_std": 1.341089129447937, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.262451171875, "rewards/ppl_reward/std": 3.9425253868103027, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 120.390625, "completions/mean_terminated_length": 120.390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.6177276880901614, "grad_norm": 1.8296988010406494, "kl": 4.3466796875, "learning_rate": 1.7110090420065546e-05, "loss": 0.0996, "num_tokens": 26611038.0, "reward": -0.959228515625, "reward_std": 0.7082605361938477, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.49658203125, "rewards/ppl_reward/std": 2.539335250854492, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 107.1875, "completions/mean_terminated_length": 107.1875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.6189460858970453, "grad_norm": 1.658007025718689, "kl": 3.197265625, "learning_rate": 1.7104101084777353e-05, "loss": 0.0952, "num_tokens": 26624346.0, "reward": -4.6170654296875, "reward_std": 1.0736849308013916, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -12.991943359375, "rewards/ppl_reward/std": 11.635359764099121, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 112.484375, "completions/mean_terminated_length": 112.484375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.6201644837039293, "grad_norm": 2.0736329555511475, "kl": 4.85546875, "learning_rate": 1.7098106600095204e-05, "loss": 0.1863, "num_tokens": 26638049.0, "reward": -0.1533203125, "reward_std": 0.618861734867096, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.025390625, "rewards/ppl_reward/std": 1.5272283554077148, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.6213828815108133, "grad_norm": 2.280811309814453, "kl": 3.24609375, "learning_rate": 1.7092106970364185e-05, "loss": 0.1144, "num_tokens": 26652673.0, "reward": -0.1390380859375, "reward_std": 0.2706299424171448, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.114013671875, "rewards/ppl_reward/std": 1.8263397216796875, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 118.421875, "completions/mean_terminated_length": 118.421875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.6226012793176974, "grad_norm": 1.7232755422592163, "kl": 3.0, "learning_rate": 1.7086102199933116e-05, "loss": 0.1747, "num_tokens": 26667372.0, "reward": -1.672607421875, "reward_std": 0.4848828911781311, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.13427734375, "rewards/ppl_reward/std": 2.978688955307007, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 114.4375, "completions/mean_terminated_length": 114.4375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.6238196771245812, "grad_norm": 1.7185368537902832, "kl": 4.98046875, "learning_rate": 1.7080092293154543e-05, "loss": 0.2034, "num_tokens": 26681968.0, "reward": -1.4091796875, "reward_std": 1.232139229774475, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.466796875, "rewards/ppl_reward/std": 5.113056659698486, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.625038074931465, "grad_norm": 2.622345447540283, "kl": 6.33984375, "learning_rate": 1.7074077254384743e-05, "loss": 0.3304, "num_tokens": 26696588.0, "reward": -0.9454345703125, "reward_std": 0.7726231217384338, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.523681640625, "rewards/ppl_reward/std": 2.036059856414795, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.626256472738349, "grad_norm": 2.2783069610595703, "kl": 5.6640625, "learning_rate": 1.7068057087983693e-05, "loss": 0.2844, "num_tokens": 26710524.0, "reward": -1.198974609375, "reward_std": 0.4474110007286072, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.17138671875, "rewards/ppl_reward/std": 2.5267786979675293, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 111.84375, "completions/mean_terminated_length": 111.84375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.627474870545233, "grad_norm": 2.9241623878479004, "kl": 6.30078125, "learning_rate": 1.70620317983151e-05, "loss": 0.2888, "num_tokens": 26724666.0, "reward": -1.2952880859375, "reward_std": 0.8079179525375366, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.285888671875, "rewards/ppl_reward/std": 3.5602385997772217, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 103.96875, "completions/mean_terminated_length": 103.96875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.628693268352117, "grad_norm": 3.2372167110443115, "kl": 4.26953125, "learning_rate": 1.70560013897464e-05, "loss": 0.1199, "num_tokens": 26738120.0, "reward": -0.59661865234375, "reward_std": 0.6451734900474548, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.8651123046875, "rewards/ppl_reward/std": 2.2628931999206543, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 105.828125, "completions/mean_terminated_length": 105.828125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.629911666159001, "grad_norm": 1.8185070753097534, "kl": 3.021484375, "learning_rate": 1.70499658666487e-05, "loss": 0.1435, "num_tokens": 26751285.0, "reward": -3.0419921875, "reward_std": 1.5606837272644043, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.849609375, "rewards/ppl_reward/std": 11.121241569519043, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 108.421875, "completions/mean_terminated_length": 108.421875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.6311300639658848, "grad_norm": 2.8929426670074463, "kl": 4.234375, "learning_rate": 1.7043925233396855e-05, "loss": 0.1713, "num_tokens": 26765440.0, "reward": -3.5867919921875, "reward_std": 0.886144757270813, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.861083984375, "rewards/ppl_reward/std": 16.61077117919922, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 108.53125, "completions/mean_terminated_length": 108.53125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.6323484617727688, "grad_norm": 7.15258264541626, "kl": 7.8359375, "learning_rate": 1.7037879494369398e-05, "loss": 0.3911, "num_tokens": 26778858.0, "reward": -0.96240234375, "reward_std": 0.8430618047714233, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.5029296875, "rewards/ppl_reward/std": 2.8248584270477295, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 116.921875, "completions/mean_terminated_length": 116.921875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.6335668595796529, "grad_norm": 1.9611170291900635, "kl": 4.00390625, "learning_rate": 1.7031828653948573e-05, "loss": 0.0914, "num_tokens": 26793909.0, "reward": -2.5841064453125, "reward_std": 0.3834063708782196, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.988525390625, "rewards/ppl_reward/std": 6.546685218811035, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 106.671875, "completions/mean_terminated_length": 106.671875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.6347852573865367, "grad_norm": 4.648008823394775, "kl": 7.51171875, "learning_rate": 1.7025772716520324e-05, "loss": 0.3619, "num_tokens": 26807464.0, "reward": -1.040771484375, "reward_std": 1.125571846961975, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.68310546875, "rewards/ppl_reward/std": 2.836158514022827, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 110.453125, "completions/mean_terminated_length": 110.453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.6360036551934205, "grad_norm": 1.7479444742202759, "kl": 2.271484375, "learning_rate": 1.7019711686474277e-05, "loss": 0.0089, "num_tokens": 26821429.0, "reward": -0.526611328125, "reward_std": 0.30518293380737305, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.95947265625, "rewards/ppl_reward/std": 0.9812214374542236, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.6372220530003045, "grad_norm": 2.096014976501465, "kl": 4.64453125, "learning_rate": 1.701364556820376e-05, "loss": 0.2224, "num_tokens": 26834401.0, "reward": -1.4486083984375, "reward_std": 1.5275784730911255, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.592529296875, "rewards/ppl_reward/std": 5.813434600830078, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.6384404508071886, "grad_norm": 1.8607206344604492, "kl": 3.201171875, "learning_rate": 1.7007574366105795e-05, "loss": 0.0744, "num_tokens": 26850685.0, "reward": -2.0306396484375, "reward_std": 0.3716268539428711, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.889404296875, "rewards/ppl_reward/std": 5.771060943603516, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 115.59375, "completions/mean_terminated_length": 115.59375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.6396588486140726, "grad_norm": 2.812389612197876, "kl": 5.2724609375, "learning_rate": 1.7001498084581076e-05, "loss": 0.1727, "num_tokens": 26864531.0, "reward": -1.81475830078125, "reward_std": 0.8626348376274109, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.3170166015625, "rewards/ppl_reward/std": 4.862125873565674, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 112.984375, "completions/mean_terminated_length": 112.984375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.6408772464209564, "grad_norm": 7.325847148895264, "kl": 6.640625, "learning_rate": 1.699541672803398e-05, "loss": 0.2627, "num_tokens": 26878098.0, "reward": -2.43603515625, "reward_std": 1.849677324295044, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.4580078125, "rewards/ppl_reward/std": 7.638454914093018, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 117.9375, "completions/mean_terminated_length": 117.9375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.6420956442278403, "grad_norm": 2.1624677181243896, "kl": 3.837890625, "learning_rate": 1.6989330300872576e-05, "loss": 0.1618, "num_tokens": 26892318.0, "reward": -1.96966552734375, "reward_std": 1.1407619714736938, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.6580810546875, "rewards/ppl_reward/std": 7.22761344909668, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 142.703125, "completions/mean_terminated_length": 142.703125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.6433140420347243, "grad_norm": 2.3862764835357666, "kl": 8.25, "learning_rate": 1.6983238807508585e-05, "loss": 0.4557, "num_tokens": 26909107.0, "reward": -0.518798828125, "reward_std": 0.6298553943634033, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.61572265625, "rewards/ppl_reward/std": 1.6240332126617432, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 124.546875, "completions/mean_terminated_length": 124.546875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.6445324398416084, "grad_norm": 1.7657088041305542, "kl": 2.564453125, "learning_rate": 1.697714225235743e-05, "loss": 0.0763, "num_tokens": 26923718.0, "reward": -5.22021484375, "reward_std": 0.7312346696853638, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -14.3154296875, "rewards/ppl_reward/std": 24.230976104736328, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 128.3125, "completions/mean_terminated_length": 128.3125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.6457508376484924, "grad_norm": 2.399085283279419, "kl": 8.1484375, "learning_rate": 1.6971040639838175e-05, "loss": 0.3715, "num_tokens": 26938402.0, "reward": -0.7615966796875, "reward_std": 0.6477870345115662, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -4.999755859375, "rewards/ppl_reward/std": 3.0543179512023926, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1317027360200882, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 142.390625, "completions/mean_terminated_length": 142.390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.6469692354553762, "grad_norm": 1.7106854915618896, "kl": 4.646484375, "learning_rate": 1.696493397437357e-05, "loss": 0.1983, "num_tokens": 26955091.0, "reward": -1.3555908203125, "reward_std": 1.0222294330596924, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.312744140625, "rewards/ppl_reward/std": 3.886323928833008, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 124.671875, "completions/mean_terminated_length": 124.671875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.64818763326226, "grad_norm": 1.3666045665740967, "kl": 3.798828125, "learning_rate": 1.6958822260390015e-05, "loss": 0.0815, "num_tokens": 26969550.0, "reward": -2.0987548828125, "reward_std": 0.8736672401428223, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.963134765625, "rewards/ppl_reward/std": 5.155953884124756, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 121.1875, "completions/mean_terminated_length": 121.1875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.649406031069144, "grad_norm": 1.8507089614868164, "kl": 2.91796875, "learning_rate": 1.6952705502317577e-05, "loss": 0.1161, "num_tokens": 26983970.0, "reward": -2.055419921875, "reward_std": 0.509856104850769, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.81396484375, "rewards/ppl_reward/std": 6.886270523071289, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 135.390625, "completions/mean_terminated_length": 135.390625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.6506244288760281, "grad_norm": 2.280735731124878, "kl": 7.92578125, "learning_rate": 1.6946583704589973e-05, "loss": 0.306, "num_tokens": 26999419.0, "reward": -2.9525146484375, "reward_std": 2.7779128551483154, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.420654296875, "rewards/ppl_reward/std": 11.535623550415039, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.651842826682912, "grad_norm": 1.858676552772522, "kl": 6.85546875, "learning_rate": 1.694045687164458e-05, "loss": 0.2549, "num_tokens": 27016827.0, "reward": -1.27606201171875, "reward_std": 1.619733452796936, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.9427490234375, "rewards/ppl_reward/std": 6.5899434089660645, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19527530670166016, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 148.734375, "completions/mean_terminated_length": 148.734375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.6530612244897958, "grad_norm": 1.622072696685791, "kl": 6.6103515625, "learning_rate": 1.6934325007922418e-05, "loss": 0.3552, "num_tokens": 27033466.0, "reward": -3.5179443359375, "reward_std": 0.8182387351989746, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.621826171875, "rewards/ppl_reward/std": 16.447418212890625, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17390352487564087, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 146.5625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.6542796222966798, "grad_norm": 2.356081962585449, "kl": 7.8046875, "learning_rate": 1.6928188117868157e-05, "loss": 0.3227, "num_tokens": 27049902.0, "reward": -3.1346435546875, "reward_std": 1.1768277883529663, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -9.542724609375, "rewards/ppl_reward/std": 6.044521331787109, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.27091917395591736, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 140.484375, "completions/mean_terminated_length": 140.484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.6554980201035638, "grad_norm": 2.287654399871826, "kl": 4.720703125, "learning_rate": 1.6922046205930112e-05, "loss": 0.1808, "num_tokens": 27066429.0, "reward": -2.76220703125, "reward_std": 0.6583189964294434, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.0556640625, "rewards/ppl_reward/std": 9.90666389465332, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2691108286380768, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 128.03125, "completions/mean_terminated_length": 128.03125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.6567164179104479, "grad_norm": 2.9145753383636475, "kl": 5.96484375, "learning_rate": 1.6915899276560237e-05, "loss": 0.1401, "num_tokens": 27081575.0, "reward": -4.29302978515625, "reward_std": 3.888434648513794, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -11.8360595703125, "rewards/ppl_reward/std": 12.647785186767578, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.3435921370983124, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 156.578125, "completions/mean_terminated_length": 156.578125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.6579348157173317, "grad_norm": 1.6153560876846313, "kl": 4.013671875, "learning_rate": 1.6909747334214116e-05, "loss": 0.062, "num_tokens": 27099988.0, "reward": -1.4267578125, "reward_std": 1.7559118270874023, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.291015625, "rewards/ppl_reward/std": 5.469139575958252, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.30860671401023865, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 138.515625, "completions/mean_terminated_length": 138.515625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.6591532135242155, "grad_norm": 2.028160572052002, "kl": 2.2470703125, "learning_rate": 1.6903590383350975e-05, "loss": 0.0817, "num_tokens": 27115685.0, "reward": -1.1468505859375, "reward_std": 0.6389906406402588, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.098388671875, "rewards/ppl_reward/std": 4.029764175415039, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 143.453125, "completions/mean_terminated_length": 143.453125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.6603716113310996, "grad_norm": 2.0751736164093018, "kl": 5.3515625, "learning_rate": 1.6897428428433675e-05, "loss": 0.3103, "num_tokens": 27132074.0, "reward": -3.7886962890625, "reward_std": 3.0296244621276855, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.061767578125, "rewards/ppl_reward/std": 10.651771545410156, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2236899733543396, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 153.421875, "completions/mean_terminated_length": 153.421875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.6615900091379836, "grad_norm": 1.530861496925354, "kl": 3.7890625, "learning_rate": 1.6891261473928685e-05, "loss": 0.0853, "num_tokens": 27149829.0, "reward": -0.917236328125, "reward_std": 0.6699618101119995, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.39697265625, "rewards/ppl_reward/std": 1.98178231716156, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 145.65625, "completions/mean_terminated_length": 145.65625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.6628084069448676, "grad_norm": 2.0028603076934814, "kl": 6.2734375, "learning_rate": 1.688508952430612e-05, "loss": 0.2706, "num_tokens": 27165943.0, "reward": -0.6558837890625, "reward_std": 0.7621920108795166, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.819580078125, "rewards/ppl_reward/std": 1.9856765270233154, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2257249802350998, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 142.15625, "completions/mean_terminated_length": 142.15625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.6640268047517515, "grad_norm": 1.7767388820648193, "kl": 2.8603515625, "learning_rate": 1.6878912584039698e-05, "loss": 0.1017, "num_tokens": 27182313.0, "reward": -1.431884765625, "reward_std": 0.9900107383728027, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.55908203125, "rewards/ppl_reward/std": 3.100907325744629, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2076999396085739, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 138.3125, "completions/mean_terminated_length": 138.3125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.6652452025586353, "grad_norm": 7.555438995361328, "kl": 3.392578125, "learning_rate": 1.6872730657606772e-05, "loss": 0.1044, "num_tokens": 27197677.0, "reward": -0.44171142578125, "reward_std": 0.6694244742393494, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.5552978515625, "rewards/ppl_reward/std": 2.2647860050201416, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 131.171875, "completions/mean_terminated_length": 131.171875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.6664636003655193, "grad_norm": 1.5836845636367798, "kl": 3.23046875, "learning_rate": 1.686654374948829e-05, "loss": 0.1455, "num_tokens": 27212704.0, "reward": -0.16943359375, "reward_std": 0.7961229085922241, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -3.8544921875, "rewards/ppl_reward/std": 1.5523127317428589, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 138.03125, "completions/mean_terminated_length": 138.03125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.6676819981724034, "grad_norm": 6.9134111404418945, "kl": 9.1875, "learning_rate": 1.6860351864168825e-05, "loss": 0.3868, "num_tokens": 27228226.0, "reward": -2.156982421875, "reward_std": 0.9464629888534546, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -6.44677734375, "rewards/ppl_reward/std": 1.7084755897521973, "rewards/tag_count_reward/mean": 0.76953125, "rewards/tag_count_reward/std": 0.2103695124387741, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 137.515625, "completions/mean_terminated_length": 137.515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.6689003959792872, "grad_norm": 6.05731725692749, "kl": 9.3203125, "learning_rate": 1.685415500613655e-05, "loss": 0.3651, "num_tokens": 27244171.0, "reward": -2.293212890625, "reward_std": 1.1441280841827393, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.61767578125, "rewards/ppl_reward/std": 3.114283561706543, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.2592533528804779, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 154.046875, "completions/mean_terminated_length": 154.046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.6701187937861712, "grad_norm": 8.982366561889648, "kl": 13.0859375, "learning_rate": 1.684795317988325e-05, "loss": 0.5301, "num_tokens": 27262118.0, "reward": -2.03729248046875, "reward_std": 1.5373756885528564, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.0042724609375, "rewards/ppl_reward/std": 5.792895793914795, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.29164010286331177, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 135.484375, "completions/mean_terminated_length": 135.484375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.671337191593055, "grad_norm": 4.312788009643555, "kl": 6.69140625, "learning_rate": 1.6841746389904306e-05, "loss": 0.1718, "num_tokens": 27277573.0, "reward": -2.4610595703125, "reward_std": 3.1163110733032227, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -7.375244140625, "rewards/ppl_reward/std": 11.017685890197754, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.2567298710346222, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 127.171875, "completions/mean_terminated_length": 127.171875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.672555589399939, "grad_norm": 2.564302921295166, "kl": 6.046875, "learning_rate": 1.6835534640698697e-05, "loss": 0.123, "num_tokens": 27292480.0, "reward": -2.3275146484375, "reward_std": 2.044985771179199, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "rewards/ppl_reward/mean": -6.998779296875, "rewards/ppl_reward/std": 4.368070602416992, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.30820462107658386, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.6737739872068231, "grad_norm": 1.8954726457595825, "kl": 2.86328125, "learning_rate": 1.6829317936768995e-05, "loss": 0.0554, "num_tokens": 27306632.0, "reward": -3.3306884765625, "reward_std": 3.672577381134033, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "rewards/ppl_reward/mean": -9.270751953125, "rewards/ppl_reward/std": 8.437883377075195, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.28770697116851807, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 121.8125, "completions/mean_terminated_length": 121.8125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.674992385013707, "grad_norm": 2.9256556034088135, "kl": 2.5390625, "learning_rate": 1.6823096282621366e-05, "loss": -0.0022, "num_tokens": 27321628.0, "reward": -1.13336181640625, "reward_std": 1.8164622783660889, "rewards/format_reward/mean": 0.578125, "rewards/format_reward/std": 0.49776285886764526, "rewards/ppl_reward/mean": -4.9698486328125, "rewards/ppl_reward/std": 4.0739521980285645, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.3077012598514557, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 136.015625, "completions/mean_terminated_length": 136.015625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.6762107828205908, "grad_norm": 2.273477554321289, "kl": 3.99609375, "learning_rate": 1.6816869682765564e-05, "loss": 0.1206, "num_tokens": 27337749.0, "reward": -2.9102783203125, "reward_std": 2.0055360794067383, "rewards/format_reward/mean": 0.578125, "rewards/format_reward/std": 0.49776285886764526, "rewards/ppl_reward/mean": -8.609619140625, "rewards/ppl_reward/std": 7.188587665557861, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.2996101677417755, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 123.53125, "completions/mean_terminated_length": 123.53125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.6774291806274748, "grad_norm": 2.308366537094116, "kl": 2.921875, "learning_rate": 1.6810638141714933e-05, "loss": 0.0593, "num_tokens": 27352543.0, "reward": -3.020751953125, "reward_std": 2.3083858489990234, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -8.93212890625, "rewards/ppl_reward/std": 6.306028842926025, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2538151443004608, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 122.015625, "completions/mean_terminated_length": 122.015625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.6786475784343589, "grad_norm": 2.3323681354522705, "kl": 3.4765625, "learning_rate": 1.680440166398639e-05, "loss": 0.1025, "num_tokens": 27366968.0, "reward": -5.61083984375, "reward_std": 3.9179043769836426, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -14.4404296875, "rewards/ppl_reward/std": 20.456327438354492, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.23091863095760345, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 117.265625, "completions/mean_terminated_length": 117.265625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.679865976241243, "grad_norm": 3.640235424041748, "kl": 3.6640625, "learning_rate": 1.6798160254100433e-05, "loss": 0.1959, "num_tokens": 27380593.0, "reward": -1.60003662109375, "reward_std": 0.9614405632019043, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.5828857421875, "rewards/ppl_reward/std": 3.5235702991485596, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18191926181316376, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 118.296875, "completions/mean_terminated_length": 118.296875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.6810843740481267, "grad_norm": 3.082345724105835, "kl": 6.0234375, "learning_rate": 1.679191391658114e-05, "loss": 0.2522, "num_tokens": 27394852.0, "reward": -1.795654296875, "reward_std": 1.43777334690094, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -6.59130859375, "rewards/ppl_reward/std": 3.863089084625244, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.22712838649749756, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 129.703125, "completions/mean_terminated_length": 129.703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.6823027718550105, "grad_norm": 2.6869492530822754, "kl": 7.1953125, "learning_rate": 1.678566265595615e-05, "loss": 0.3798, "num_tokens": 27410633.0, "reward": -6.04443359375, "reward_std": 2.5918476581573486, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48795005679130554, "rewards/ppl_reward/mean": -15.0888671875, "rewards/ppl_reward/std": 17.584436416625977, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 142.8125, "completions/mean_terminated_length": 142.8125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.6835211696618946, "grad_norm": 2.024259090423584, "kl": 8.2734375, "learning_rate": 1.677940647675668e-05, "loss": 0.4725, "num_tokens": 27427149.0, "reward": -2.9185791015625, "reward_std": 1.5755788087844849, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -9.079345703125, "rewards/ppl_reward/std": 7.726041793823242, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2395833432674408, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 127.140625, "completions/mean_terminated_length": 127.140625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.6847395674687786, "grad_norm": 3.8986246585845947, "kl": 6.875, "learning_rate": 1.677314538351751e-05, "loss": 0.3017, "num_tokens": 27442526.0, "reward": -1.3870849609375, "reward_std": 0.9774640202522278, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4917473793029785, "rewards/ppl_reward/mean": -5.742919921875, "rewards/ppl_reward/std": 2.4419808387756348, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 129.28125, "completions/mean_terminated_length": 129.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.6859579652756627, "grad_norm": 2.4054977893829346, "kl": 4.53125, "learning_rate": 1.6766879380776983e-05, "loss": 0.1477, "num_tokens": 27457976.0, "reward": -1.02288818359375, "reward_std": 0.5796988606452942, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.6082763671875, "rewards/ppl_reward/std": 4.255792140960693, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.6871763630825465, "grad_norm": 2.117205858230591, "kl": 7.388671875, "learning_rate": 1.6760608473076997e-05, "loss": 0.3881, "num_tokens": 27473868.0, "reward": -2.1031494140625, "reward_std": 1.3016812801361084, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.550048828125, "rewards/ppl_reward/std": 5.193146228790283, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.24750742316246033, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 118.015625, "completions/mean_terminated_length": 118.015625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.6883947608894303, "grad_norm": 2.781897783279419, "kl": 5.35546875, "learning_rate": 1.6754332664963006e-05, "loss": 0.2325, "num_tokens": 27488389.0, "reward": -0.7718505859375, "reward_std": 0.5312595367431641, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.223388671875, "rewards/ppl_reward/std": 1.75965416431427, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 127.40625, "completions/mean_terminated_length": 127.40625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.6896131586963143, "grad_norm": 2.10105562210083, "kl": 4.087890625, "learning_rate": 1.674805196098402e-05, "loss": 0.144, "num_tokens": 27504015.0, "reward": -1.3148193359375, "reward_std": 0.5563551187515259, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.309326171875, "rewards/ppl_reward/std": 4.284334182739258, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 117.234375, "completions/mean_terminated_length": 117.234375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.6908315565031984, "grad_norm": 2.114309072494507, "kl": 3.3046875, "learning_rate": 1.6741766365692597e-05, "loss": 0.0764, "num_tokens": 27518366.0, "reward": -2.579345703125, "reward_std": 0.6425302028656006, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.94775390625, "rewards/ppl_reward/std": 8.27846908569336, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.6920499543100822, "grad_norm": 1.9843106269836426, "kl": 4.505859375, "learning_rate": 1.6735475883644833e-05, "loss": 0.211, "num_tokens": 27531474.0, "reward": -1.5888671875, "reward_std": 0.5945994853973389, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.865234375, "rewards/ppl_reward/std": 4.042819499969482, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 116.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.693268352116966, "grad_norm": 3.9676902294158936, "kl": 6.1953125, "learning_rate": 1.6729180519400377e-05, "loss": 0.1619, "num_tokens": 27545258.0, "reward": -3.425537109375, "reward_std": 4.349576950073242, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.39794921875, "rewards/ppl_reward/std": 22.992023468017578, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2366211861371994, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 112.515625, "completions/mean_terminated_length": 112.515625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.69448674992385, "grad_norm": 2.1873865127563477, "kl": 4.875, "learning_rate": 1.6722880277522408e-05, "loss": 0.1594, "num_tokens": 27558499.0, "reward": -1.5792236328125, "reward_std": 1.3785531520843506, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.752197265625, "rewards/ppl_reward/std": 5.003735065460205, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 136.53125, "completions/mean_terminated_length": 136.53125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.695705147730734, "grad_norm": 2.0493600368499756, "kl": 4.2802734375, "learning_rate": 1.6716575162577647e-05, "loss": 0.1574, "num_tokens": 27575101.0, "reward": -3.46923828125, "reward_std": 0.6107962727546692, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.6572265625, "rewards/ppl_reward/std": 14.506043434143066, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 140.984375, "completions/mean_terminated_length": 140.984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.6969235455376181, "grad_norm": 1.9716286659240723, "kl": 6.091796875, "learning_rate": 1.671026517913634e-05, "loss": 0.3788, "num_tokens": 27591124.0, "reward": -1.40484619140625, "reward_std": 1.3501029014587402, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.4190673828125, "rewards/ppl_reward/std": 5.019806385040283, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.698141943344502, "grad_norm": 2.8499298095703125, "kl": 7.671875, "learning_rate": 1.670395033177227e-05, "loss": 0.3624, "num_tokens": 27605980.0, "reward": -1.339599609375, "reward_std": 1.2213172912597656, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.10107421875, "rewards/ppl_reward/std": 4.007601737976074, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.25479042530059814, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 127.609375, "completions/mean_terminated_length": 127.609375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.6993603411513858, "grad_norm": 2.511692523956299, "kl": 4.109375, "learning_rate": 1.6697630625062745e-05, "loss": 0.2043, "num_tokens": 27621147.0, "reward": -3.515380859375, "reward_std": 3.5687906742095947, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.67919921875, "rewards/ppl_reward/std": 20.318824768066406, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 132.828125, "completions/mean_terminated_length": 132.828125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.7005787389582698, "grad_norm": 4.157810211181641, "kl": 3.57421875, "learning_rate": 1.6691306063588583e-05, "loss": 0.0999, "num_tokens": 27636608.0, "reward": -2.3154296875, "reward_std": 0.6567988395690918, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.419921875, "rewards/ppl_reward/std": 7.74949836730957, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 140.109375, "completions/mean_terminated_length": 140.109375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.7017971367651539, "grad_norm": 1.3548369407653809, "kl": 3.958984375, "learning_rate": 1.6684976651934135e-05, "loss": 0.1437, "num_tokens": 27653439.0, "reward": -2.0523681640625, "reward_std": 1.770259141921997, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.839111328125, "rewards/ppl_reward/std": 8.732404708862305, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.703015534572038, "grad_norm": 1.9058690071105957, "kl": 3.38671875, "learning_rate": 1.667864239468727e-05, "loss": 0.1089, "num_tokens": 27668419.0, "reward": -4.484619140625, "reward_std": 2.459991216659546, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -12.74267578125, "rewards/ppl_reward/std": 16.601409912109375, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 132.46875, "completions/mean_terminated_length": 132.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.7042339323789217, "grad_norm": 9.987963676452637, "kl": 5.3515625, "learning_rate": 1.667230329643935e-05, "loss": 0.3157, "num_tokens": 27683801.0, "reward": -0.916259765625, "reward_std": 0.5800584554672241, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.58251953125, "rewards/ppl_reward/std": 2.689056158065796, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 145.171875, "completions/mean_terminated_length": 145.171875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.7054523301858056, "grad_norm": 1.5095990896224976, "kl": 4.0546875, "learning_rate": 1.6665959361785265e-05, "loss": 0.1852, "num_tokens": 27700556.0, "reward": -0.8101806640625, "reward_std": 0.5448265671730042, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.362548828125, "rewards/ppl_reward/std": 1.9730976819992065, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 132.28125, "completions/mean_terminated_length": 132.28125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.7066707279926896, "grad_norm": 1.6165794134140015, "kl": 3.2392578125, "learning_rate": 1.6659610595323405e-05, "loss": 0.1401, "num_tokens": 27715974.0, "reward": -1.81396484375, "reward_std": 1.1614153385162354, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.3857421875, "rewards/ppl_reward/std": 7.2294840812683105, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 141.515625, "completions/mean_terminated_length": 141.515625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.7078891257995736, "grad_norm": 2.121609926223755, "kl": 6.73046875, "learning_rate": 1.6653257001655652e-05, "loss": 0.2853, "num_tokens": 27732119.0, "reward": -2.78662109375, "reward_std": 2.5797204971313477, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.1357421875, "rewards/ppl_reward/std": 12.7086820602417, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21764887869358063, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.7091075236064575, "grad_norm": 1.6004929542541504, "kl": 3.61328125, "learning_rate": 1.6646898585387413e-05, "loss": 0.1201, "num_tokens": 27747641.0, "reward": -1.70556640625, "reward_std": 1.0976834297180176, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.1455078125, "rewards/ppl_reward/std": 6.522800445556641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 125.640625, "completions/mean_terminated_length": 125.640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.7103259214133415, "grad_norm": 1.8018732070922852, "kl": 2.7978515625, "learning_rate": 1.6640535351127557e-05, "loss": 0.1015, "num_tokens": 27762330.0, "reward": -1.554443359375, "reward_std": 0.33865827322006226, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.99951171875, "rewards/ppl_reward/std": 3.7564563751220703, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 129.953125, "completions/mean_terminated_length": 129.953125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.7115443192202253, "grad_norm": 2.1748485565185547, "kl": 2.4853515625, "learning_rate": 1.6634167303488467e-05, "loss": 0.0723, "num_tokens": 27777799.0, "reward": -1.3984375, "reward_std": 0.3076837360858917, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.7265625, "rewards/ppl_reward/std": 3.946579694747925, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 149.171875, "completions/mean_terminated_length": 149.171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.7127627170271094, "grad_norm": 2.1022753715515137, "kl": 6.9375, "learning_rate": 1.6627794447086013e-05, "loss": 0.3241, "num_tokens": 27795178.0, "reward": -3.241943359375, "reward_std": 1.0788700580596924, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.16357421875, "rewards/ppl_reward/std": 12.450404167175293, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 132.40625, "completions/mean_terminated_length": 132.40625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.7139811148339934, "grad_norm": 2.153477191925049, "kl": 3.48046875, "learning_rate": 1.6621416786539545e-05, "loss": 0.1105, "num_tokens": 27810204.0, "reward": -1.2386474609375, "reward_std": 0.45041266083717346, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.375732421875, "rewards/ppl_reward/std": 2.8691394329071045, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 146.53125, "completions/mean_terminated_length": 146.53125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.7151995126408772, "grad_norm": 1.6223537921905518, "kl": 4.56640625, "learning_rate": 1.66150343264719e-05, "loss": 0.174, "num_tokens": 27827702.0, "reward": -1.2332763671875, "reward_std": 1.0248215198516846, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.107177734375, "rewards/ppl_reward/std": 3.4469432830810547, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 119.3125, "completions/mean_terminated_length": 119.3125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.716417910447761, "grad_norm": 1.399727463722229, "kl": 2.26171875, "learning_rate": 1.6608647071509392e-05, "loss": 0.0306, "num_tokens": 27841962.0, "reward": -4.6724853515625, "reward_std": 1.0605854988098145, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -13.173095703125, "rewards/ppl_reward/std": 18.78901481628418, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.717636308254645, "grad_norm": 1.6682069301605225, "kl": 2.7431640625, "learning_rate": 1.6602255026281808e-05, "loss": 0.1306, "num_tokens": 27856866.0, "reward": -1.98040771484375, "reward_std": 0.3745851516723633, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.8280029296875, "rewards/ppl_reward/std": 7.302960395812988, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 122.984375, "completions/mean_terminated_length": 122.984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.7188547060615291, "grad_norm": 1.6809808015823364, "kl": 4.1162109375, "learning_rate": 1.6595858195422414e-05, "loss": 0.1589, "num_tokens": 27871401.0, "reward": -1.41064453125, "reward_std": 0.8123223781585693, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.5087890625, "rewards/ppl_reward/std": 4.255673885345459, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 113.78125, "completions/mean_terminated_length": 113.78125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.7200731038684132, "grad_norm": 1.9226628541946411, "kl": 3.03515625, "learning_rate": 1.6589456583567934e-05, "loss": 0.0903, "num_tokens": 27885027.0, "reward": -1.6632080078125, "reward_std": 0.4447917938232422, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.131103515625, "rewards/ppl_reward/std": 5.566122055053711, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 119.46875, "completions/mean_terminated_length": 119.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.721291501675297, "grad_norm": 2.0128567218780518, "kl": 4.640625, "learning_rate": 1.658305019535857e-05, "loss": 0.188, "num_tokens": 27899369.0, "reward": -1.8115234375, "reward_std": 0.5314149856567383, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.404296875, "rewards/ppl_reward/std": 4.7437968254089355, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 136.6875, "completions/mean_terminated_length": 136.6875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.7225098994821808, "grad_norm": 1.9285253286361694, "kl": 2.4462890625, "learning_rate": 1.6576639035437975e-05, "loss": 0.0552, "num_tokens": 27916653.0, "reward": -0.565185546875, "reward_std": 0.2392149567604065, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.01318359375, "rewards/ppl_reward/std": 3.058086633682251, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 120.890625, "completions/mean_terminated_length": 120.890625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.7237282972890648, "grad_norm": 5.0839762687683105, "kl": 2.66796875, "learning_rate": 1.657022310845327e-05, "loss": 0.0903, "num_tokens": 27930966.0, "reward": -0.5927734375, "reward_std": 0.48819437623023987, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.013671875, "rewards/ppl_reward/std": 2.7013022899627686, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 126.0625, "completions/mean_terminated_length": 126.0625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.724946695095949, "grad_norm": 1.4975146055221558, "kl": 3.8359375, "learning_rate": 1.6563802419055023e-05, "loss": 0.103, "num_tokens": 27945986.0, "reward": -2.72637939453125, "reward_std": 0.508044958114624, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -9.3043212890625, "rewards/ppl_reward/std": 11.69151496887207, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.726165092902833, "grad_norm": 1.6818689107894897, "kl": 3.072265625, "learning_rate": 1.655737697189727e-05, "loss": 0.0899, "num_tokens": 27961798.0, "reward": -0.8223876953125, "reward_std": 0.5331453084945679, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.386962890625, "rewards/ppl_reward/std": 2.1619696617126465, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 122.15625, "completions/mean_terminated_length": 122.15625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.7273834907097168, "grad_norm": 1.5937368869781494, "kl": 3.275390625, "learning_rate": 1.6550946771637467e-05, "loss": 0.1399, "num_tokens": 27976592.0, "reward": -1.75970458984375, "reward_std": 0.4986012578010559, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.2772216796875, "rewards/ppl_reward/std": 4.871711254119873, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 120.03125, "completions/mean_terminated_length": 120.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.7286018885166006, "grad_norm": 2.1412453651428223, "kl": 5.974609375, "learning_rate": 1.654451182293654e-05, "loss": 0.2987, "num_tokens": 27990706.0, "reward": -3.140869140625, "reward_std": 2.1893322467803955, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.03955078125, "rewards/ppl_reward/std": 10.227564811706543, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.7298202863234846, "grad_norm": 1.9118741750717163, "kl": 5.4453125, "learning_rate": 1.6538072130458853e-05, "loss": 0.243, "num_tokens": 28005090.0, "reward": -1.1533203125, "reward_std": 1.4964277744293213, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.048828125, "rewards/ppl_reward/std": 5.2796454429626465, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 133.484375, "completions/mean_terminated_length": 133.484375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.7310386841303687, "grad_norm": 1.3456014394760132, "kl": 3.294921875, "learning_rate": 1.65316276988722e-05, "loss": 0.1371, "num_tokens": 28020937.0, "reward": -0.6368408203125, "reward_std": 0.41486918926239014, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.047119140625, "rewards/ppl_reward/std": 3.3439059257507324, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 133.015625, "completions/mean_terminated_length": 133.015625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.7322570819372525, "grad_norm": 2.7159712314605713, "kl": 6.8515625, "learning_rate": 1.6525178532847816e-05, "loss": 0.3096, "num_tokens": 28036450.0, "reward": -1.1170654296875, "reward_std": 0.974108874797821, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.835693359375, "rewards/ppl_reward/std": 4.50332498550415, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 133.03125, "completions/mean_terminated_length": 133.03125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.7334754797441365, "grad_norm": 1.3982750177383423, "kl": 2.572265625, "learning_rate": 1.6518724637060368e-05, "loss": 0.1289, "num_tokens": 28051596.0, "reward": -1.2149658203125, "reward_std": 0.3805837631225586, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.250244140625, "rewards/ppl_reward/std": 3.434316873550415, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 138.78125, "completions/mean_terminated_length": 138.78125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.7346938775510203, "grad_norm": 5.647111892700195, "kl": 11.76171875, "learning_rate": 1.6512266016187946e-05, "loss": 0.6112, "num_tokens": 28067102.0, "reward": -0.83709716796875, "reward_std": 1.4543437957763672, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.2132568359375, "rewards/ppl_reward/std": 4.984683513641357, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2257249802350998, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 125.09375, "completions/mean_terminated_length": 125.09375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.7359122753579044, "grad_norm": 2.318264961242676, "kl": 6.05078125, "learning_rate": 1.6505802674912074e-05, "loss": 0.2795, "num_tokens": 28081836.0, "reward": -0.963623046875, "reward_std": 1.371735692024231, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.61474609375, "rewards/ppl_reward/std": 6.608135223388672, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.7371306731647884, "grad_norm": 2.045689344406128, "kl": 3.0048828125, "learning_rate": 1.6499334617917687e-05, "loss": 0.1386, "num_tokens": 28097592.0, "reward": -0.979248046875, "reward_std": 0.4831411838531494, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.76318359375, "rewards/ppl_reward/std": 1.9074413776397705, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 138.109375, "completions/mean_terminated_length": 138.109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.7383490709716722, "grad_norm": 2.358821153640747, "kl": 5.859375, "learning_rate": 1.649286184989315e-05, "loss": 0.2671, "num_tokens": 28114079.0, "reward": -0.6470947265625, "reward_std": 0.6742070913314819, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.778564453125, "rewards/ppl_reward/std": 2.0012969970703125, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 133.953125, "completions/mean_terminated_length": 133.953125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.739567468778556, "grad_norm": 3.1432578563690186, "kl": 6.4765625, "learning_rate": 1.648638437553023e-05, "loss": 0.2572, "num_tokens": 28130252.0, "reward": -0.9710693359375, "reward_std": 0.9326184988021851, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.403076171875, "rewards/ppl_reward/std": 3.2106590270996094, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 157.203125, "completions/mean_terminated_length": 157.203125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.74078586658544, "grad_norm": 2.6011431217193604, "kl": 7.0234375, "learning_rate": 1.6479902199524116e-05, "loss": 0.3227, "num_tokens": 28148849.0, "reward": -1.68695068359375, "reward_std": 0.8870095610618591, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.9598388671875, "rewards/ppl_reward/std": 8.535407066345215, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 133.53125, "completions/mean_terminated_length": 133.53125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.7420042643923241, "grad_norm": 2.007965326309204, "kl": 6.0625, "learning_rate": 1.64734153265734e-05, "loss": 0.2606, "num_tokens": 28164467.0, "reward": -1.39404296875, "reward_std": 0.7198870182037354, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.2099609375, "rewards/ppl_reward/std": 3.546753168106079, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 120.4375, "completions/mean_terminated_length": 120.4375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.7432226621992082, "grad_norm": 2.0164687633514404, "kl": 3.6953125, "learning_rate": 1.6466923761380077e-05, "loss": 0.1956, "num_tokens": 28179143.0, "reward": -1.82568359375, "reward_std": 0.7799137830734253, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.4013671875, "rewards/ppl_reward/std": 6.394312858581543, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 122.671875, "completions/mean_terminated_length": 122.671875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.744441060006092, "grad_norm": 1.9567734003067017, "kl": 3.1591796875, "learning_rate": 1.6460427508649546e-05, "loss": 0.1112, "num_tokens": 28193858.0, "reward": -1.4346923828125, "reward_std": 0.45238661766052246, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.697509765625, "rewards/ppl_reward/std": 4.599045753479004, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 124.078125, "completions/mean_terminated_length": 124.078125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.7456594578129758, "grad_norm": 2.6538331508636475, "kl": 4.703125, "learning_rate": 1.6453926573090603e-05, "loss": 0.2021, "num_tokens": 28208991.0, "reward": -0.438720703125, "reward_std": 0.5991740226745605, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.40869140625, "rewards/ppl_reward/std": 2.3930516242980957, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 136.4375, "completions/mean_terminated_length": 136.4375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.7468778556198599, "grad_norm": 2.2914576530456543, "kl": 4.28125, "learning_rate": 1.6447420959415445e-05, "loss": 0.2531, "num_tokens": 28225179.0, "reward": -1.6654052734375, "reward_std": 0.5877448320388794, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.018310546875, "rewards/ppl_reward/std": 5.075520992279053, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 126.59375, "completions/mean_terminated_length": 126.59375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.748096253426744, "grad_norm": 4.069082736968994, "kl": 3.509765625, "learning_rate": 1.6440910672339647e-05, "loss": 0.2238, "num_tokens": 28240025.0, "reward": -1.5472412109375, "reward_std": 0.8366341590881348, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.867919921875, "rewards/ppl_reward/std": 4.8446831703186035, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 128.640625, "completions/mean_terminated_length": 128.640625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.7493146512336277, "grad_norm": 2.5603673458099365, "kl": 3.546875, "learning_rate": 1.6434395716582178e-05, "loss": 0.2179, "num_tokens": 28255618.0, "reward": -1.28460693359375, "reward_std": 1.6626579761505127, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.2567138671875, "rewards/ppl_reward/std": 7.162115573883057, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 135.609375, "completions/mean_terminated_length": 135.609375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.7505330490405118, "grad_norm": 1.9808133840560913, "kl": 7.3046875, "learning_rate": 1.6427876096865394e-05, "loss": 0.4372, "num_tokens": 28271449.0, "reward": -1.65496826171875, "reward_std": 0.9499474763870239, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.7318115234375, "rewards/ppl_reward/std": 6.501601696014404, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 146.203125, "completions/mean_terminated_length": 146.203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.7517514468473956, "grad_norm": 4.154696464538574, "kl": 8.765625, "learning_rate": 1.6421351817915025e-05, "loss": 0.4668, "num_tokens": 28288086.0, "reward": -1.8094482421875, "reward_std": 1.3312673568725586, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.962646484375, "rewards/ppl_reward/std": 4.596372127532959, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 125.3125, "completions/mean_terminated_length": 125.3125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.7529698446542796, "grad_norm": 3.8348464965820312, "kl": 9.1953125, "learning_rate": 1.641482288446019e-05, "loss": 0.3781, "num_tokens": 28303658.0, "reward": -4.01080322265625, "reward_std": 1.7705904245376587, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -11.2247314453125, "rewards/ppl_reward/std": 14.391815185546875, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.255761981010437, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.7541882424611637, "grad_norm": 3.1251137256622314, "kl": 9.1796875, "learning_rate": 1.6408289301233366e-05, "loss": 0.3469, "num_tokens": 28318514.0, "reward": -0.388427734375, "reward_std": 0.756803572177887, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4604927599430084, "rewards/ppl_reward/mean": -4.01123046875, "rewards/ppl_reward/std": 1.0514402389526367, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 132.65625, "completions/mean_terminated_length": 132.65625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.7554066402680475, "grad_norm": 12.274822235107422, "kl": 11.15625, "learning_rate": 1.640175107297041e-05, "loss": 0.7426, "num_tokens": 28333508.0, "reward": -1.3497314453125, "reward_std": 1.1270781755447388, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4917473793029785, "rewards/ppl_reward/mean": -5.699462890625, "rewards/ppl_reward/std": 2.8711705207824707, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.19352105259895325, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 112.390625, "completions/mean_terminated_length": 112.390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.7566250380749313, "grad_norm": 3.648829460144043, "kl": 6.5234375, "learning_rate": 1.639520820441054e-05, "loss": 0.3449, "num_tokens": 28347101.0, "reward": -1.3070068359375, "reward_std": 1.4314424991607666, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -5.887451171875, "rewards/ppl_reward/std": 3.2532825469970703, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18926911056041718, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.7578434358818154, "grad_norm": 3.4111146926879883, "kl": 6.927734375, "learning_rate": 1.6388660700296353e-05, "loss": 0.3639, "num_tokens": 28363453.0, "reward": -0.563720703125, "reward_std": 0.5401644706726074, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.67431640625, "rewards/ppl_reward/std": 1.7195032835006714, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 118.609375, "completions/mean_terminated_length": 118.609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.7590618336886994, "grad_norm": 6.550919532775879, "kl": 7.4453125, "learning_rate": 1.6382108565373783e-05, "loss": 0.4706, "num_tokens": 28377148.0, "reward": -3.4615478515625, "reward_std": 3.0632286071777344, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -10.188720703125, "rewards/ppl_reward/std": 7.5131025314331055, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2302463799715042, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 109.484375, "completions/mean_terminated_length": 109.484375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.7602802314955834, "grad_norm": 3.35496187210083, "kl": 3.3125, "learning_rate": 1.6375551804392142e-05, "loss": 0.1526, "num_tokens": 28390747.0, "reward": -2.5775146484375, "reward_std": 1.5692102909088135, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.811279296875, "rewards/ppl_reward/std": 8.760309219360352, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 125.34375, "completions/mean_terminated_length": 125.34375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.7614986293024673, "grad_norm": 3.8504183292388916, "kl": 4.45703125, "learning_rate": 1.6368990422104078e-05, "loss": 0.2204, "num_tokens": 28406241.0, "reward": -0.78564453125, "reward_std": 1.3644704818725586, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.0322265625, "rewards/ppl_reward/std": 4.900233268737793, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.762717027109351, "grad_norm": 2.293656587600708, "kl": 4.16796875, "learning_rate": 1.63624244232656e-05, "loss": 0.2071, "num_tokens": 28421137.0, "reward": -0.9605712890625, "reward_std": 0.4236800968647003, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.577392578125, "rewards/ppl_reward/std": 2.6733481884002686, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 112.046875, "completions/mean_terminated_length": 112.046875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.7639354249162351, "grad_norm": 6.077849864959717, "kl": 5.05078125, "learning_rate": 1.635585381263606e-05, "loss": 0.2307, "num_tokens": 28435308.0, "reward": -2.664306640625, "reward_std": 0.6484494805335999, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.99267578125, "rewards/ppl_reward/std": 7.279184818267822, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 130.4375, "completions/mean_terminated_length": 130.4375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.7651538227231192, "grad_norm": 4.004051685333252, "kl": 7.0859375, "learning_rate": 1.6349278594978147e-05, "loss": 0.4199, "num_tokens": 28451248.0, "reward": -4.1961669921875, "reward_std": 0.8276524543762207, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.939208984375, "rewards/ppl_reward/std": 16.258583068847656, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 124.109375, "completions/mean_terminated_length": 124.109375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.7663722205300032, "grad_norm": 4.720588684082031, "kl": 10.4609375, "learning_rate": 1.63426987750579e-05, "loss": 0.5558, "num_tokens": 28466135.0, "reward": -0.17108154296875, "reward_std": 0.5331648588180542, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -3.8656005859375, "rewards/ppl_reward/std": 1.8988181352615356, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1620931327342987, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.767590618336887, "grad_norm": 2.3754539489746094, "kl": 4.3232421875, "learning_rate": 1.6336114357644693e-05, "loss": 0.1996, "num_tokens": 28479991.0, "reward": -1.0401611328125, "reward_std": 1.0104074478149414, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.791259765625, "rewards/ppl_reward/std": 4.595479488372803, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.7688090161437708, "grad_norm": 2.433363199234009, "kl": 6.40625, "learning_rate": 1.632952534751122e-05, "loss": 0.36, "num_tokens": 28494229.0, "reward": -1.257080078125, "reward_std": 0.5791330933570862, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.17041015625, "rewards/ppl_reward/std": 1.8949270248413086, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 110.609375, "completions/mean_terminated_length": 110.609375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.7700274139506549, "grad_norm": 4.241726398468018, "kl": 7.484375, "learning_rate": 1.6322931749433512e-05, "loss": 0.2781, "num_tokens": 28507628.0, "reward": -4.0390625, "reward_std": 2.161590814590454, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -11.6328125, "rewards/ppl_reward/std": 8.357085227966309, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 131.921875, "completions/mean_terminated_length": 131.921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.771245811757539, "grad_norm": 6.128734588623047, "kl": 13.44921875, "learning_rate": 1.6316333568190933e-05, "loss": 0.7263, "num_tokens": 28523559.0, "reward": -1.457275390625, "reward_std": 0.8941154479980469, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.33642578125, "rewards/ppl_reward/std": 3.3424363136291504, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2002912163734436, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 113.765625, "completions/mean_terminated_length": 113.765625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.7724642095644227, "grad_norm": 1.9212749004364014, "kl": 6.9775390625, "learning_rate": 1.630973080856616e-05, "loss": 0.4129, "num_tokens": 28537928.0, "reward": -0.8968505859375, "reward_std": 0.4783543050289154, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.535888671875, "rewards/ppl_reward/std": 2.4830477237701416, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 112.90625, "completions/mean_terminated_length": 112.90625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.7736826073713068, "grad_norm": 2.2627370357513428, "kl": 7.625, "learning_rate": 1.6303123475345182e-05, "loss": 0.4425, "num_tokens": 28551650.0, "reward": -1.12225341796875, "reward_std": 0.5714428424835205, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.8851318359375, "rewards/ppl_reward/std": 2.9880967140197754, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 107.125, "completions/mean_terminated_length": 107.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.7749010051781906, "grad_norm": 1.4743269681930542, "kl": 3.673828125, "learning_rate": 1.6296511573317322e-05, "loss": 0.1618, "num_tokens": 28565218.0, "reward": -0.721923828125, "reward_std": 0.35796284675598145, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.29541015625, "rewards/ppl_reward/std": 2.345944404602051, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 120.953125, "completions/mean_terminated_length": 120.953125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.7761194029850746, "grad_norm": 2.454190731048584, "kl": 6.5703125, "learning_rate": 1.6289895107275203e-05, "loss": 0.344, "num_tokens": 28581127.0, "reward": -0.47662353515625, "reward_std": 0.7638903260231018, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.5938720703125, "rewards/ppl_reward/std": 2.297691822052002, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 112.34375, "completions/mean_terminated_length": 112.34375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.7773378007919587, "grad_norm": 1.9414817094802856, "kl": 4.380859375, "learning_rate": 1.628327408201475e-05, "loss": 0.1582, "num_tokens": 28595621.0, "reward": -1.472412109375, "reward_std": 1.8009796142578125, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.62451171875, "rewards/ppl_reward/std": 7.902655601501465, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 107.359375, "completions/mean_terminated_length": 107.359375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.7785561985988425, "grad_norm": 2.6435134410858154, "kl": 4.33984375, "learning_rate": 1.6276648502335207e-05, "loss": 0.1562, "num_tokens": 28609172.0, "reward": -1.917236328125, "reward_std": 1.2628018856048584, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.60791015625, "rewards/ppl_reward/std": 5.924127101898193, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 99.859375, "completions/mean_terminated_length": 99.859375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.7797745964057263, "grad_norm": 2.2570319175720215, "kl": 5.140625, "learning_rate": 1.6270018373039118e-05, "loss": 0.1784, "num_tokens": 28621755.0, "reward": -1.4949951171875, "reward_std": 0.7664299011230469, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.583740234375, "rewards/ppl_reward/std": 3.9318206310272217, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 135.171875, "completions/mean_terminated_length": 135.171875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.7809929942126104, "grad_norm": 1.9829106330871582, "kl": 6.78125, "learning_rate": 1.6263383698932307e-05, "loss": 0.3572, "num_tokens": 28638182.0, "reward": -0.35888671875, "reward_std": 0.6952450275421143, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.3037109375, "rewards/ppl_reward/std": 1.9388453960418701, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 121.15625, "completions/mean_terminated_length": 121.15625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.7822113920194944, "grad_norm": 2.666926145553589, "kl": 8.9140625, "learning_rate": 1.6256744484823912e-05, "loss": 0.4697, "num_tokens": 28652824.0, "reward": -1.3668212890625, "reward_std": 1.1037405729293823, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.116455078125, "rewards/ppl_reward/std": 4.316888809204102, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.13858474791049957, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.7834297898263785, "grad_norm": 2.0567026138305664, "kl": 6.5234375, "learning_rate": 1.6250100735526354e-05, "loss": 0.2544, "num_tokens": 28668628.0, "reward": -1.05908203125, "reward_std": 0.9122035503387451, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.5947265625, "rewards/ppl_reward/std": 3.11655592918396, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18191926181316376, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 114.4375, "completions/mean_terminated_length": 114.4375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.7846481876332623, "grad_norm": 2.6982696056365967, "kl": 5.8046875, "learning_rate": 1.624345245585534e-05, "loss": 0.2788, "num_tokens": 28682936.0, "reward": -2.4930419921875, "reward_std": 1.624558448791504, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.673583984375, "rewards/ppl_reward/std": 7.339099884033203, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 106.734375, "completions/mean_terminated_length": 106.734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.785866585440146, "grad_norm": 2.6322896480560303, "kl": 5.40625, "learning_rate": 1.6236799650629863e-05, "loss": 0.1332, "num_tokens": 28696175.0, "reward": -1.3544921875, "reward_std": 1.7617826461791992, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.255859375, "rewards/ppl_reward/std": 5.519327163696289, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 115.203125, "completions/mean_terminated_length": 115.203125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.7870849832470301, "grad_norm": 2.677885055541992, "kl": 3.5703125, "learning_rate": 1.6230142324672198e-05, "loss": 0.0876, "num_tokens": 28710268.0, "reward": -0.7022705078125, "reward_std": 0.8997573256492615, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.099853515625, "rewards/ppl_reward/std": 1.8852330446243286, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.7883033810539142, "grad_norm": 2.6009345054626465, "kl": 4.2119140625, "learning_rate": 1.6223480482807896e-05, "loss": 0.1886, "num_tokens": 28725828.0, "reward": -2.12451171875, "reward_std": 0.7982079982757568, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.9833984375, "rewards/ppl_reward/std": 6.87003231048584, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 118.140625, "completions/mean_terminated_length": 118.140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.789521778860798, "grad_norm": 6.76277494430542, "kl": 4.140625, "learning_rate": 1.6216814129865772e-05, "loss": 0.1343, "num_tokens": 28740149.0, "reward": -1.1192626953125, "reward_std": 0.7664810419082642, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.972900390625, "rewards/ppl_reward/std": 3.609334945678711, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 120.109375, "completions/mean_terminated_length": 120.109375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.790740176667682, "grad_norm": 2.277430295944214, "kl": 5.0625, "learning_rate": 1.621014327067793e-05, "loss": 0.2053, "num_tokens": 28754468.0, "reward": -3.1424560546875, "reward_std": 1.6462063789367676, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.909912109375, "rewards/ppl_reward/std": 11.40173053741455, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.7919585744745659, "grad_norm": 1.7158538103103638, "kl": 2.146484375, "learning_rate": 1.6203467910079722e-05, "loss": 0.0618, "num_tokens": 28768408.0, "reward": -1.110107421875, "reward_std": 0.5695501565933228, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.09521484375, "rewards/ppl_reward/std": 4.324396133422852, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 115.53125, "completions/mean_terminated_length": 115.53125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.79317697228145, "grad_norm": 1.969986915588379, "kl": 2.4404296875, "learning_rate": 1.6196788052909772e-05, "loss": 0.0785, "num_tokens": 28782106.0, "reward": -1.350341796875, "reward_std": 0.5246358513832092, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.52880859375, "rewards/ppl_reward/std": 4.094540596008301, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 119.609375, "completions/mean_terminated_length": 119.609375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.794395370088334, "grad_norm": 2.169217348098755, "kl": 5.6328125, "learning_rate": 1.619010370400996e-05, "loss": 0.1377, "num_tokens": 28796377.0, "reward": -2.986083984375, "reward_std": 2.3375260829925537, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.51123046875, "rewards/ppl_reward/std": 12.508949279785156, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24947242438793182, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 130.15625, "completions/mean_terminated_length": 130.15625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.7956137678952178, "grad_norm": 2.0016086101531982, "kl": 3.97265625, "learning_rate": 1.6183414868225434e-05, "loss": 0.1182, "num_tokens": 28811651.0, "reward": -2.4500732421875, "reward_std": 0.791498601436615, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.595458984375, "rewards/ppl_reward/std": 6.434535026550293, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 122.046875, "completions/mean_terminated_length": 122.046875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.7968321657021016, "grad_norm": 6.8713154792785645, "kl": 10.875, "learning_rate": 1.617672155040457e-05, "loss": 0.4048, "num_tokens": 28825934.0, "reward": -1.258056640625, "reward_std": 1.0755963325500488, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.82861328125, "rewards/ppl_reward/std": 2.920440435409546, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.27094778418540955, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 125.078125, "completions/mean_terminated_length": 125.078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.7980505635089856, "grad_norm": 2.2744054794311523, "kl": 4.89453125, "learning_rate": 1.6170023755399008e-05, "loss": 0.1629, "num_tokens": 28840459.0, "reward": -0.892822265625, "reward_std": 1.5288978815078735, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.36376953125, "rewards/ppl_reward/std": 4.403494358062744, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 137.65625, "completions/mean_terminated_length": 137.65625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.7992689613158697, "grad_norm": 1.8209470510482788, "kl": 4.044921875, "learning_rate": 1.6163321488063636e-05, "loss": 0.0409, "num_tokens": 28856461.0, "reward": -0.6343994140625, "reward_std": 0.9231512546539307, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.870361328125, "rewards/ppl_reward/std": 1.9665160179138184, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.21675680577754974, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 129.078125, "completions/mean_terminated_length": 129.078125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.8004873591227537, "grad_norm": 1.861891269683838, "kl": 5.58984375, "learning_rate": 1.6156614753256583e-05, "loss": 0.1988, "num_tokens": 28870954.0, "reward": -2.1729736328125, "reward_std": 1.5832219123840332, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.892822265625, "rewards/ppl_reward/std": 4.391798496246338, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 119.296875, "completions/mean_terminated_length": 119.296875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.8017057569296375, "grad_norm": 3.629528045654297, "kl": 5.6171875, "learning_rate": 1.614990355583921e-05, "loss": 0.1389, "num_tokens": 28884725.0, "reward": -4.3404541015625, "reward_std": 3.250195026397705, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -12.079345703125, "rewards/ppl_reward/std": 15.629781723022461, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.25046461820602417, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.8029241547365213, "grad_norm": 2.8876047134399414, "kl": 3.58203125, "learning_rate": 1.6143187900676112e-05, "loss": 0.0673, "num_tokens": 28901039.0, "reward": -2.02734375, "reward_std": 0.8678293824195862, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.75, "rewards/ppl_reward/std": 5.494713306427002, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 151.1875, "completions/mean_terminated_length": 151.1875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.8041425525434054, "grad_norm": 1.7545932531356812, "kl": 3.37890625, "learning_rate": 1.613646779263512e-05, "loss": 0.0787, "num_tokens": 28918555.0, "reward": -1.5247802734375, "reward_std": 0.864012598991394, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.783935546875, "rewards/ppl_reward/std": 2.88922381401062, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.8053609503502894, "grad_norm": 2.0152230262756348, "kl": 7.109375, "learning_rate": 1.6129743236587293e-05, "loss": 0.4345, "num_tokens": 28936227.0, "reward": -0.6107177734375, "reward_std": 0.6250550746917725, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.822998046875, "rewards/ppl_reward/std": 1.9492831230163574, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2121305763721466, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 144.40625, "completions/mean_terminated_length": 144.40625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.8065793481571735, "grad_norm": 1.888916015625, "kl": 4.099609375, "learning_rate": 1.6123014237406912e-05, "loss": 0.1954, "num_tokens": 28952557.0, "reward": -1.634765625, "reward_std": 1.7855618000030518, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.94140625, "rewards/ppl_reward/std": 7.665105819702148, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 161.578125, "completions/mean_terminated_length": 161.578125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.8077977459640573, "grad_norm": 1.7062714099884033, "kl": 3.755859375, "learning_rate": 1.6116280799971477e-05, "loss": 0.1542, "num_tokens": 28970258.0, "reward": -1.134521484375, "reward_std": 0.8334439992904663, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.95654296875, "rewards/ppl_reward/std": 2.054703712463379, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2182178944349289, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 153.359375, "completions/mean_terminated_length": 153.359375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.809016143770941, "grad_norm": 1.45244300365448, "kl": 3.37890625, "learning_rate": 1.6109542929161713e-05, "loss": 0.1155, "num_tokens": 28987841.0, "reward": -1.006103515625, "reward_std": 0.7469710111618042, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.72314453125, "rewards/ppl_reward/std": 2.295325517654419, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.244957834482193, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.8102345415778252, "grad_norm": 2.5938446521759033, "kl": 4.4453125, "learning_rate": 1.6102800629861544e-05, "loss": 0.1469, "num_tokens": 29003465.0, "reward": -1.17266845703125, "reward_std": 1.0662603378295898, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.0250244140625, "rewards/ppl_reward/std": 4.905054569244385, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 134.578125, "completions/mean_terminated_length": 134.578125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.8114529393847092, "grad_norm": 3.9970757961273193, "kl": 5.185546875, "learning_rate": 1.6096053906958124e-05, "loss": 0.1971, "num_tokens": 29019038.0, "reward": -1.409423828125, "reward_std": 0.536023736000061, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.51416015625, "rewards/ppl_reward/std": 4.160738468170166, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 125.21875, "completions/mean_terminated_length": 125.21875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.812671337191593, "grad_norm": 3.006049633026123, "kl": 5.63671875, "learning_rate": 1.608930276534179e-05, "loss": 0.2403, "num_tokens": 29033604.0, "reward": -0.788330078125, "reward_std": 0.6390228867530823, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.27197265625, "rewards/ppl_reward/std": 3.4273929595947266, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 115.90625, "completions/mean_terminated_length": 115.90625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.813889734998477, "grad_norm": 1.8561581373214722, "kl": 4.8046875, "learning_rate": 1.60825472099061e-05, "loss": 0.1332, "num_tokens": 29047638.0, "reward": -3.37164306640625, "reward_std": 4.635988235473633, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.2589111328125, "rewards/ppl_reward/std": 23.851118087768555, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 117.28125, "completions/mean_terminated_length": 117.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.8151081328053609, "grad_norm": 1.8885705471038818, "kl": 7.578125, "learning_rate": 1.6075787245547805e-05, "loss": 0.3224, "num_tokens": 29062104.0, "reward": -1.3316650390625, "reward_std": 0.9199609756469727, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.218017578125, "rewards/ppl_reward/std": 5.332767009735107, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22736713290214539, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 113.953125, "completions/mean_terminated_length": 113.953125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.816326530612245, "grad_norm": 2.087296724319458, "kl": 6.28515625, "learning_rate": 1.6069022877166854e-05, "loss": 0.3793, "num_tokens": 29076117.0, "reward": -0.4561767578125, "reward_std": 1.1661920547485352, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.396728515625, "rewards/ppl_reward/std": 2.8398733139038086, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.3052735924720764, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 118.484375, "completions/mean_terminated_length": 118.484375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.817544928419129, "grad_norm": 2.470654010772705, "kl": 4.1259765625, "learning_rate": 1.6062254109666383e-05, "loss": 0.1944, "num_tokens": 29091052.0, "reward": -1.5765380859375, "reward_std": 1.5185524225234985, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.848388671875, "rewards/ppl_reward/std": 5.189780235290527, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2215663492679596, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 113.53125, "completions/mean_terminated_length": 113.53125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.8187633262260128, "grad_norm": 1.725788950920105, "kl": 4.3837890625, "learning_rate": 1.605548094795272e-05, "loss": 0.2251, "num_tokens": 29105302.0, "reward": -2.123291015625, "reward_std": 1.195017695426941, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.87939453125, "rewards/ppl_reward/std": 4.458348274230957, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 118.015625, "completions/mean_terminated_length": 118.015625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.8199817240328966, "grad_norm": 3.6340579986572266, "kl": 10.765625, "learning_rate": 1.604870339693538e-05, "loss": 0.526, "num_tokens": 29119799.0, "reward": -2.4798583984375, "reward_std": 1.57926607131958, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.248779296875, "rewards/ppl_reward/std": 3.5024895668029785, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2439432591199875, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 106.296875, "completions/mean_terminated_length": 106.296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.8212001218397806, "grad_norm": 2.163634777069092, "kl": 5.572265625, "learning_rate": 1.6041921461527054e-05, "loss": 0.3105, "num_tokens": 29133098.0, "reward": -1.46746826171875, "reward_std": 1.6251105070114136, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.5755615234375, "rewards/ppl_reward/std": 6.663662433624268, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 108.1875, "completions/mean_terminated_length": 108.1875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.8224185196466647, "grad_norm": 1.687705159187317, "kl": 4.6796875, "learning_rate": 1.6035135146643623e-05, "loss": 0.1602, "num_tokens": 29147542.0, "reward": -0.7113037109375, "reward_std": 1.0232148170471191, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.063232421875, "rewards/ppl_reward/std": 3.737685203552246, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 110.21875, "completions/mean_terminated_length": 110.21875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.8236369174535487, "grad_norm": 2.976576805114746, "kl": 6.6796875, "learning_rate": 1.6028344457204127e-05, "loss": 0.3048, "num_tokens": 29161628.0, "reward": -1.6959228515625, "reward_std": 0.5384366512298584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.055908203125, "rewards/ppl_reward/std": 3.945117235183716, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 108.90625, "completions/mean_terminated_length": 108.90625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.8248553152604325, "grad_norm": 2.041236400604248, "kl": 5.06640625, "learning_rate": 1.6021549398130787e-05, "loss": 0.2266, "num_tokens": 29175134.0, "reward": -1.96630859375, "reward_std": 0.6899492740631104, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.6123046875, "rewards/ppl_reward/std": 4.376760482788086, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 97.765625, "completions/mean_terminated_length": 97.765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.8260737130673164, "grad_norm": 2.0144217014312744, "kl": 6.0859375, "learning_rate": 1.6014749974348987e-05, "loss": 0.2172, "num_tokens": 29187783.0, "reward": -2.4390869140625, "reward_std": 2.5948634147644043, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.378173828125, "rewards/ppl_reward/std": 9.509974479675293, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.19352105259895325, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 109.109375, "completions/mean_terminated_length": 109.109375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.8272921108742004, "grad_norm": 2.0952224731445312, "kl": 3.5244140625, "learning_rate": 1.600794619078728e-05, "loss": 0.1454, "num_tokens": 29202270.0, "reward": -0.4437255859375, "reward_std": 0.4456550180912018, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.668701171875, "rewards/ppl_reward/std": 1.8469319343566895, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 113.625, "completions/mean_terminated_length": 113.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.8285105086810844, "grad_norm": 1.608416199684143, "kl": 2.90234375, "learning_rate": 1.600113805237737e-05, "loss": 0.0766, "num_tokens": 29216718.0, "reward": -2.3480224609375, "reward_std": 1.9804853200912476, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.453857421875, "rewards/ppl_reward/std": 11.014394760131836, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 117.484375, "completions/mean_terminated_length": 117.484375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.8297289064879685, "grad_norm": 2.143908977508545, "kl": 3.3125, "learning_rate": 1.5994325564054122e-05, "loss": 0.2167, "num_tokens": 29231261.0, "reward": -0.504150390625, "reward_std": 0.20335820317268372, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.96142578125, "rewards/ppl_reward/std": 2.156738758087158, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 128.171875, "completions/mean_terminated_length": 128.171875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.8309473042948523, "grad_norm": 7.078126907348633, "kl": 6.880859375, "learning_rate": 1.5987508730755562e-05, "loss": 0.3612, "num_tokens": 29246576.0, "reward": -1.770263671875, "reward_std": 0.7760281562805176, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.19677734375, "rewards/ppl_reward/std": 4.875812530517578, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.8321657021017361, "grad_norm": 2.8505959510803223, "kl": 8.453125, "learning_rate": 1.5980687557422854e-05, "loss": 0.4268, "num_tokens": 29260968.0, "reward": -2.35009765625, "reward_std": 0.9127241373062134, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.3330078125, "rewards/ppl_reward/std": 5.217050075531006, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 127.46875, "completions/mean_terminated_length": 127.46875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.8333840999086202, "grad_norm": 2.835301637649536, "kl": 9.052734375, "learning_rate": 1.5973862049000316e-05, "loss": 0.5145, "num_tokens": 29276342.0, "reward": -1.1226806640625, "reward_std": 0.976040244102478, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.854736328125, "rewards/ppl_reward/std": 3.6589417457580566, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 140.34375, "completions/mean_terminated_length": 140.34375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.8346024977155042, "grad_norm": 4.141074180603027, "kl": 9.8046875, "learning_rate": 1.5967032210435397e-05, "loss": 0.5034, "num_tokens": 29292572.0, "reward": -7.8265380859375, "reward_std": 4.260855197906494, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -19.168701171875, "rewards/ppl_reward/std": 30.85873794555664, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 123.140625, "completions/mean_terminated_length": 123.140625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.835820895522388, "grad_norm": 2.7706685066223145, "kl": 5.80078125, "learning_rate": 1.59601980466787e-05, "loss": 0.339, "num_tokens": 29307125.0, "reward": -0.9132080078125, "reward_std": 0.5530534982681274, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.615478515625, "rewards/ppl_reward/std": 3.3400282859802246, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.8370392933292718, "grad_norm": 3.8121936321258545, "kl": 11.7265625, "learning_rate": 1.5953359562683953e-05, "loss": 0.6888, "num_tokens": 29325357.0, "reward": -0.533203125, "reward_std": 0.6453242301940918, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.62109375, "rewards/ppl_reward/std": 2.5035290718078613, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 140.796875, "completions/mean_terminated_length": 140.796875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.838257691136156, "grad_norm": 4.128809928894043, "kl": 8.390625, "learning_rate": 1.5946516763408015e-05, "loss": 0.6889, "num_tokens": 29341368.0, "reward": -1.476806640625, "reward_std": 0.6778621673583984, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.68017578125, "rewards/ppl_reward/std": 8.61412239074707, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 130.015625, "completions/mean_terminated_length": 130.015625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.83947608894304, "grad_norm": 2.6842591762542725, "kl": 9.1640625, "learning_rate": 1.5939669653810882e-05, "loss": 0.6482, "num_tokens": 29356289.0, "reward": -2.8626708984375, "reward_std": 0.8403571844100952, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.451904296875, "rewards/ppl_reward/std": 8.670188903808594, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.840694486749924, "grad_norm": 1.896399974822998, "kl": 5.9296875, "learning_rate": 1.5932818238855658e-05, "loss": 0.3147, "num_tokens": 29370345.0, "reward": -1.606689453125, "reward_std": 0.6188414096832275, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.06494140625, "rewards/ppl_reward/std": 4.669007301330566, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 123.40625, "completions/mean_terminated_length": 123.40625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.8419128845568078, "grad_norm": 3.9877021312713623, "kl": 5.458984375, "learning_rate": 1.592596252350859e-05, "loss": 0.2888, "num_tokens": 29385091.0, "reward": -0.87060546875, "reward_std": 0.8179070353507996, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.3818359375, "rewards/ppl_reward/std": 3.911470651626587, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 126.71875, "completions/mean_terminated_length": 126.71875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.8431312823636916, "grad_norm": 1.8934650421142578, "kl": 6.6484375, "learning_rate": 1.591910251273902e-05, "loss": 0.4116, "num_tokens": 29399617.0, "reward": -1.7181396484375, "reward_std": 0.977352499961853, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.108154296875, "rewards/ppl_reward/std": 4.5156707763671875, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 123.6875, "completions/mean_terminated_length": 123.6875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.8443496801705757, "grad_norm": 2.3838295936584473, "kl": 6.703125, "learning_rate": 1.5912238211519425e-05, "loss": 0.2233, "num_tokens": 29414821.0, "reward": -1.3482666015625, "reward_std": 1.029491662979126, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.173095703125, "rewards/ppl_reward/std": 3.2932701110839844, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2372427135705948, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 123.734375, "completions/mean_terminated_length": 123.734375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.8455680779774597, "grad_norm": 3.211815118789673, "kl": 7.62890625, "learning_rate": 1.5905369624825367e-05, "loss": 0.4226, "num_tokens": 29429364.0, "reward": -1.8756103515625, "reward_std": 0.8622369766235352, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.509033203125, "rewards/ppl_reward/std": 7.086119651794434, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 126.953125, "completions/mean_terminated_length": 126.953125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.8467864757843437, "grad_norm": 2.0843780040740967, "kl": 6.23046875, "learning_rate": 1.5898496757635538e-05, "loss": 0.357, "num_tokens": 29444017.0, "reward": -2.4439697265625, "reward_std": 0.7986663579940796, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.536376953125, "rewards/ppl_reward/std": 7.897687911987305, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 130.40625, "completions/mean_terminated_length": 130.40625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.8480048735912276, "grad_norm": 2.945274829864502, "kl": 7.2421875, "learning_rate": 1.5891619614931714e-05, "loss": 0.3351, "num_tokens": 29459395.0, "reward": -1.3629150390625, "reward_std": 0.8793420791625977, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.218017578125, "rewards/ppl_reward/std": 3.513202667236328, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2138771414756775, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 112.96875, "completions/mean_terminated_length": 112.96875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.8492232713981114, "grad_norm": 2.0903420448303223, "kl": 2.21875, "learning_rate": 1.588473820169879e-05, "loss": -0.0024, "num_tokens": 29473017.0, "reward": -1.407958984375, "reward_std": 1.008446216583252, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.60498046875, "rewards/ppl_reward/std": 5.173165798187256, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 129.328125, "completions/mean_terminated_length": 129.328125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.8504416692049954, "grad_norm": 1.8803093433380127, "kl": 5.5380859375, "learning_rate": 1.5877852522924733e-05, "loss": 0.221, "num_tokens": 29488174.0, "reward": -1.187255859375, "reward_std": 1.2867250442504883, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.09326171875, "rewards/ppl_reward/std": 5.065130233764648, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.8516600670118795, "grad_norm": 2.4409139156341553, "kl": 4.76953125, "learning_rate": 1.5870962583600622e-05, "loss": 0.2327, "num_tokens": 29502582.0, "reward": -2.702392578125, "reward_std": 0.8480877876281738, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.19384765625, "rewards/ppl_reward/std": 4.721500396728516, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 117.34375, "completions/mean_terminated_length": 117.34375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.8528784648187633, "grad_norm": 2.703700065612793, "kl": 3.9765625, "learning_rate": 1.5864068388720613e-05, "loss": 0.0904, "num_tokens": 29516380.0, "reward": -1.8524169921875, "reward_std": 1.5119800567626953, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.345458984375, "rewards/ppl_reward/std": 6.8503031730651855, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 130.65625, "completions/mean_terminated_length": 130.65625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.8540968626256473, "grad_norm": 2.4523866176605225, "kl": 5.16015625, "learning_rate": 1.5857169943281948e-05, "loss": 0.142, "num_tokens": 29531462.0, "reward": -1.1849365234375, "reward_std": 0.7706807851791382, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.010498046875, "rewards/ppl_reward/std": 3.030492067337036, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 144.296875, "completions/mean_terminated_length": 144.296875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.8553152604325311, "grad_norm": 2.0854737758636475, "kl": 3.4609375, "learning_rate": 1.585026725228496e-05, "loss": 0.1049, "num_tokens": 29548089.0, "reward": -3.76513671875, "reward_std": 2.9246327877044678, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -11.2646484375, "rewards/ppl_reward/std": 19.062545776367188, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.8565336582394152, "grad_norm": 4.43815803527832, "kl": 10.0625, "learning_rate": 1.5843360320733042e-05, "loss": 0.441, "num_tokens": 29564433.0, "reward": -0.8287353515625, "reward_std": 1.4361639022827148, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.071533203125, "rewards/ppl_reward/std": 3.526254177093506, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18729320168495178, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 145.5625, "completions/mean_terminated_length": 145.5625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.8577520560462992, "grad_norm": 1.9149163961410522, "kl": 5.00390625, "learning_rate": 1.583644915363267e-05, "loss": 0.1971, "num_tokens": 29580301.0, "reward": -3.0472412109375, "reward_std": 1.4844521284103394, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.711669921875, "rewards/ppl_reward/std": 6.1833953857421875, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 142.140625, "completions/mean_terminated_length": 142.140625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.858970453853183, "grad_norm": 1.9364837408065796, "kl": 3.87109375, "learning_rate": 1.58295337559934e-05, "loss": 0.1957, "num_tokens": 29596534.0, "reward": -1.92138671875, "reward_std": 1.46051025390625, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.5458984375, "rewards/ppl_reward/std": 6.757449626922607, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 143.78125, "completions/mean_terminated_length": 143.78125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.8601888516600669, "grad_norm": 3.0078837871551514, "kl": 3.736328125, "learning_rate": 1.582261413282784e-05, "loss": 0.1801, "num_tokens": 29613416.0, "reward": -1.5401611328125, "reward_std": 0.645356297492981, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.861572265625, "rewards/ppl_reward/std": 4.417820453643799, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 136.140625, "completions/mean_terminated_length": 136.140625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.861407249466951, "grad_norm": 1.7608742713928223, "kl": 1.6298828125, "learning_rate": 1.581569028915166e-05, "loss": 0.0101, "num_tokens": 29628729.0, "reward": -5.8978271484375, "reward_std": 0.9566396474838257, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -15.662841796875, "rewards/ppl_reward/std": 22.49204444885254, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.862625647273835, "grad_norm": 2.164902687072754, "kl": 4.1640625, "learning_rate": 1.5808762229983597e-05, "loss": 0.1707, "num_tokens": 29646009.0, "reward": -1.126953125, "reward_std": 0.9030681848526001, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.04296875, "rewards/ppl_reward/std": 5.08385705947876, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 136.984375, "completions/mean_terminated_length": 136.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.863844045080719, "grad_norm": 1.4792698621749878, "kl": 2.34765625, "learning_rate": 1.5801829960345445e-05, "loss": 0.0819, "num_tokens": 29661000.0, "reward": -3.048583984375, "reward_std": 0.6854549646377563, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.87841796875, "rewards/ppl_reward/std": 8.576088905334473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 152.578125, "completions/mean_terminated_length": 152.578125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.8650624428876028, "grad_norm": 1.6569544076919556, "kl": 2.8154296875, "learning_rate": 1.5794893485262046e-05, "loss": 0.1147, "num_tokens": 29677853.0, "reward": -5.1795654296875, "reward_std": 1.3697608709335327, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -14.195068359375, "rewards/ppl_reward/std": 24.891624450683594, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 145.640625, "completions/mean_terminated_length": 145.640625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.8662808406944866, "grad_norm": 2.808476686477661, "kl": 5.421875, "learning_rate": 1.5787952809761286e-05, "loss": 0.2521, "num_tokens": 29693614.0, "reward": -1.774169921875, "reward_std": 0.582504153251648, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.23583984375, "rewards/ppl_reward/std": 4.5773162841796875, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 148.78125, "completions/mean_terminated_length": 148.78125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.8674992385013707, "grad_norm": 1.5850496292114258, "kl": 4.3681640625, "learning_rate": 1.5781007938874103e-05, "loss": 0.1745, "num_tokens": 29710240.0, "reward": -2.437744140625, "reward_std": 1.7363474369049072, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.58642578125, "rewards/ppl_reward/std": 10.440057754516602, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 153.921875, "completions/mean_terminated_length": 153.921875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.8687176363082547, "grad_norm": 1.619667649269104, "kl": 2.498046875, "learning_rate": 1.5774058877634473e-05, "loss": 0.152, "num_tokens": 29727387.0, "reward": -0.63232421875, "reward_std": 0.24105466902256012, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.1396484375, "rewards/ppl_reward/std": 2.6154160499572754, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.8699360341151388, "grad_norm": 2.7118966579437256, "kl": 7.89453125, "learning_rate": 1.576710563107941e-05, "loss": 0.5637, "num_tokens": 29744691.0, "reward": -3.3919677734375, "reward_std": 0.647517204284668, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.541748046875, "rewards/ppl_reward/std": 9.156161308288574, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 156.15625, "completions/mean_terminated_length": 156.15625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.8711544319220226, "grad_norm": 1.6963481903076172, "kl": 7.7587890625, "learning_rate": 1.5760148204248964e-05, "loss": 0.4551, "num_tokens": 29761341.0, "reward": -1.4068603515625, "reward_std": 0.5646791458129883, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.454345703125, "rewards/ppl_reward/std": 3.9842395782470703, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 147.671875, "completions/mean_terminated_length": 147.671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.8723728297289064, "grad_norm": 2.4916701316833496, "kl": 4.3359375, "learning_rate": 1.5753186602186207e-05, "loss": 0.2683, "num_tokens": 29777360.0, "reward": -2.093017578125, "reward_std": 0.8342222571372986, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.99853515625, "rewards/ppl_reward/std": 4.06285285949707, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 149.234375, "completions/mean_terminated_length": 149.234375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.8735912275357904, "grad_norm": 2.0544540882110596, "kl": 7.2734375, "learning_rate": 1.5746220829937247e-05, "loss": 0.4732, "num_tokens": 29793503.0, "reward": -1.06201171875, "reward_std": 0.9776384234428406, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.8037109375, "rewards/ppl_reward/std": 4.451579570770264, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 161.1428680419922, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.8748096253426745, "grad_norm": 3.8056628704071045, "kl": 13.822265625, "learning_rate": 1.5739250892551207e-05, "loss": 0.9061, "num_tokens": 29812087.0, "reward": -0.551025390625, "reward_std": 0.6331583261489868, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.69580078125, "rewards/ppl_reward/std": 1.3654605150222778, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 126.015625, "completions/mean_terminated_length": 126.015625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.8760280231495583, "grad_norm": 1.6912648677825928, "kl": 3.572265625, "learning_rate": 1.573227679508024e-05, "loss": 0.1326, "num_tokens": 29826616.0, "reward": -1.0838623046875, "reward_std": 0.603778064250946, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.988037109375, "rewards/ppl_reward/std": 3.363929510116577, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 144.96875, "completions/mean_terminated_length": 144.96875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.8772464209564421, "grad_norm": 1.8732647895812988, "kl": 6.3203125, "learning_rate": 1.57252985425795e-05, "loss": 0.3793, "num_tokens": 29842406.0, "reward": -3.2161865234375, "reward_std": 2.126434087753296, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.158935546875, "rewards/ppl_reward/std": 10.921960830688477, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17567719519138336, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 184.671875, "completions/mean_terminated_length": 184.671875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.8784648187633262, "grad_norm": 6.365846633911133, "kl": 18.34375, "learning_rate": 1.5718316140107156e-05, "loss": 1.0328, "num_tokens": 29861081.0, "reward": -2.0, "reward_std": 1.788710117340088, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -7.2109375, "rewards/ppl_reward/std": 5.218262195587158, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2398419976234436, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 150.609375, "completions/mean_terminated_length": 150.609375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.8796832165702102, "grad_norm": 3.4836413860321045, "kl": 5.916015625, "learning_rate": 1.57113295927244e-05, "loss": 0.2045, "num_tokens": 29877808.0, "reward": -0.7919921875, "reward_std": 0.5080277919769287, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.240234375, "rewards/ppl_reward/std": 1.9404789209365845, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 148.796875, "completions/mean_terminated_length": 148.796875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.8809016143770942, "grad_norm": 2.9307661056518555, "kl": 9.5, "learning_rate": 1.5704338905495407e-05, "loss": 0.4276, "num_tokens": 29894619.0, "reward": -0.81689453125, "reward_std": 0.9439747929573059, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -4.9384765625, "rewards/ppl_reward/std": 2.0425753593444824, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20225508511066437, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 140.65625, "completions/mean_terminated_length": 140.65625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.882120012183978, "grad_norm": 3.10518217086792, "kl": 6.90625, "learning_rate": 1.5697344083487372e-05, "loss": 0.2619, "num_tokens": 29910285.0, "reward": -1.9129638671875, "reward_std": 0.8654369711875916, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.185302734375, "rewards/ppl_reward/std": 3.6942899227142334, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.21693551540374756, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 143.421875, "completions/mean_terminated_length": 143.421875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.8833384099908619, "grad_norm": 1.9301244020462036, "kl": 3.455078125, "learning_rate": 1.5690345131770474e-05, "loss": 0.1262, "num_tokens": 29927320.0, "reward": -1.3853759765625, "reward_std": 0.6315932273864746, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.473876953125, "rewards/ppl_reward/std": 2.615523099899292, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 123.546875, "completions/mean_terminated_length": 123.546875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 1.884556807797746, "grad_norm": 1.9976563453674316, "kl": 2.3505859375, "learning_rate": 1.568334205541789e-05, "loss": -0.0047, "num_tokens": 29941131.0, "reward": -2.37335205078125, "reward_std": 1.5679683685302734, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.3482666015625, "rewards/ppl_reward/std": 8.336666107177734, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 131.984375, "completions/mean_terminated_length": 131.984375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.88577520560463, "grad_norm": 3.1191089153289795, "kl": 3.16015625, "learning_rate": 1.5676334859505783e-05, "loss": 0.1775, "num_tokens": 29956138.0, "reward": -0.731201171875, "reward_std": 1.076396107673645, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.08740234375, "rewards/ppl_reward/std": 3.8907148838043213, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19416078925132751, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 134.3125, "completions/mean_terminated_length": 134.3125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.886993603411514, "grad_norm": 2.096986770629883, "kl": 2.861328125, "learning_rate": 1.5669323549113315e-05, "loss": 0.1329, "num_tokens": 29972022.0, "reward": -1.2894287109375, "reward_std": 0.6676523685455322, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.203857421875, "rewards/ppl_reward/std": 2.494889259338379, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 130.046875, "completions/mean_terminated_length": 130.046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.8882120012183978, "grad_norm": 2.684504508972168, "kl": 3.16796875, "learning_rate": 1.566230812932261e-05, "loss": 0.0894, "num_tokens": 29987249.0, "reward": -0.64935302734375, "reward_std": 0.9831602573394775, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.8533935546875, "rewards/ppl_reward/std": 2.202643394470215, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 130.484375, "completions/mean_terminated_length": 130.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.8894303990252816, "grad_norm": 2.5208373069763184, "kl": 2.591796875, "learning_rate": 1.565528860521879e-05, "loss": 0.1222, "num_tokens": 30002672.0, "reward": -0.272705078125, "reward_std": 0.4627646803855896, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.27978515625, "rewards/ppl_reward/std": 1.3630850315093994, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 128.078125, "completions/mean_terminated_length": 128.078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.8906487968321657, "grad_norm": 1.7840217351913452, "kl": 5.8671875, "learning_rate": 1.5648264981889936e-05, "loss": 0.1878, "num_tokens": 30017797.0, "reward": -5.3660888671875, "reward_std": 2.065800905227661, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -14.114990234375, "rewards/ppl_reward/std": 18.724578857421875, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.224347323179245, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 115.234375, "completions/mean_terminated_length": 115.234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.8918671946390497, "grad_norm": 1.916333794593811, "kl": 2.8173828125, "learning_rate": 1.5641237264427115e-05, "loss": 0.0927, "num_tokens": 30031996.0, "reward": -0.9124755859375, "reward_std": 1.1083134412765503, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.512451171875, "rewards/ppl_reward/std": 3.4197440147399902, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 121.90625, "completions/mean_terminated_length": 121.90625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.8930855924459336, "grad_norm": 22.115882873535156, "kl": 3.984375, "learning_rate": 1.563420545792435e-05, "loss": 0.1607, "num_tokens": 30046614.0, "reward": -1.5333251953125, "reward_std": 0.7779991626739502, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.722900390625, "rewards/ppl_reward/std": 4.866729736328125, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 121.78125, "completions/mean_terminated_length": 121.78125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.8943039902528176, "grad_norm": 1.8882167339324951, "kl": 4.3046875, "learning_rate": 1.5627169567478627e-05, "loss": 0.1371, "num_tokens": 30061648.0, "reward": -3.785888671875, "reward_std": 1.5634657144546509, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.07177734375, "rewards/ppl_reward/std": 7.866837501525879, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21764887869358063, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 132.046875, "completions/mean_terminated_length": 132.046875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.8955223880597014, "grad_norm": 2.042707681655884, "kl": 6.84765625, "learning_rate": 1.562012959818991e-05, "loss": 0.3742, "num_tokens": 30077691.0, "reward": -1.3328857421875, "reward_std": 0.7663302421569824, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.134521484375, "rewards/ppl_reward/std": 4.324173927307129, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16060402989387512, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 123.453125, "completions/mean_terminated_length": 123.453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.8967407858665855, "grad_norm": 2.2535598278045654, "kl": 7.4375, "learning_rate": 1.56130855551611e-05, "loss": 0.3619, "num_tokens": 30092560.0, "reward": -1.787109375, "reward_std": 1.2935278415679932, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.00390625, "rewards/ppl_reward/std": 5.134093284606934, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2043897658586502, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 123.65625, "completions/mean_terminated_length": 123.65625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.8979591836734695, "grad_norm": 2.140289068222046, "kl": 7.41796875, "learning_rate": 1.560603744349806e-05, "loss": 0.4819, "num_tokens": 30107266.0, "reward": -3.1334228515625, "reward_std": 2.50646710395813, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.790283203125, "rewards/ppl_reward/std": 11.772933959960938, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16810208559036255, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 108.40625, "completions/mean_terminated_length": 108.40625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.8991775814803533, "grad_norm": 1.9970707893371582, "kl": 3.734375, "learning_rate": 1.55989852683096e-05, "loss": 0.0665, "num_tokens": 30120908.0, "reward": -1.2593994140625, "reward_std": 0.6361442804336548, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.229736328125, "rewards/ppl_reward/std": 2.8548192977905273, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 106.46875, "completions/mean_terminated_length": 106.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.9003959792872371, "grad_norm": 3.44171142578125, "kl": 4.029296875, "learning_rate": 1.5591929034707468e-05, "loss": 0.1636, "num_tokens": 30134682.0, "reward": -0.306884765625, "reward_std": 0.46808063983917236, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.36376953125, "rewards/ppl_reward/std": 2.529348134994507, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 135.46875, "completions/mean_terminated_length": 135.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.9016143770941212, "grad_norm": 3.50991153717041, "kl": 11.3125, "learning_rate": 1.5584868747806374e-05, "loss": 0.7, "num_tokens": 30152136.0, "reward": -2.1126708984375, "reward_std": 1.4538891315460205, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -7.631591796875, "rewards/ppl_reward/std": 4.538043975830078, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 125.59375, "completions/mean_terminated_length": 125.59375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.9028327749010052, "grad_norm": 6.829843044281006, "kl": 11.953125, "learning_rate": 1.557780441272395e-05, "loss": 0.5368, "num_tokens": 30167094.0, "reward": -3.2230224609375, "reward_std": 1.6803302764892578, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.852294921875, "rewards/ppl_reward/std": 7.8015289306640625, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2592533528804779, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 129.796875, "completions/mean_terminated_length": 129.796875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.9040511727078893, "grad_norm": 4.119789123535156, "kl": 11.0703125, "learning_rate": 1.5570736034580755e-05, "loss": 0.6458, "num_tokens": 30183105.0, "reward": -0.95703125, "reward_std": 0.9145181775093079, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.4609375, "rewards/ppl_reward/std": 2.6902973651885986, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 124.59375, "completions/mean_terminated_length": 124.59375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.905269570514773, "grad_norm": 6.62839937210083, "kl": 12.5390625, "learning_rate": 1.55636636185003e-05, "loss": 0.5903, "num_tokens": 30198055.0, "reward": -1.6083984375, "reward_std": 1.3067233562469482, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.560546875, "rewards/ppl_reward/std": 2.759361505508423, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.24750742316246033, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 124.171875, "completions/mean_terminated_length": 124.171875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.906487968321657, "grad_norm": 2.3206446170806885, "kl": 7.8046875, "learning_rate": 1.555658716960901e-05, "loss": 0.4036, "num_tokens": 30213306.0, "reward": -1.2213134765625, "reward_std": 0.8617711663246155, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.052001953125, "rewards/ppl_reward/std": 2.0821239948272705, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 104.96875, "completions/mean_terminated_length": 104.96875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.907706366128541, "grad_norm": 2.505138635635376, "kl": 5.4296875, "learning_rate": 1.554950669303624e-05, "loss": 0.2464, "num_tokens": 30226792.0, "reward": -1.9288330078125, "reward_std": 1.7143146991729736, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.599853515625, "rewards/ppl_reward/std": 6.822912216186523, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.908924763935425, "grad_norm": 4.0761308670043945, "kl": 8.1328125, "learning_rate": 1.554242219391425e-05, "loss": 0.365, "num_tokens": 30241384.0, "reward": -1.780517578125, "reward_std": 1.0414464473724365, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.15478515625, "rewards/ppl_reward/std": 6.527903079986572, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 102.765625, "completions/mean_terminated_length": 102.765625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.910143161742309, "grad_norm": 2.0279741287231445, "kl": 4.814453125, "learning_rate": 1.553533367737823e-05, "loss": 0.2379, "num_tokens": 30254577.0, "reward": -1.09033203125, "reward_std": 1.1045366525650024, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.8681640625, "rewards/ppl_reward/std": 3.7958147525787354, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.208927720785141, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 105.8125, "completions/mean_terminated_length": 105.8125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.9113615595491928, "grad_norm": 3.1864585876464844, "kl": 6.27734375, "learning_rate": 1.5528241148566276e-05, "loss": 0.2754, "num_tokens": 30267901.0, "reward": -6.7337646484375, "reward_std": 1.315735101699829, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -17.155029296875, "rewards/ppl_reward/std": 28.865135192871094, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.9125799573560767, "grad_norm": 2.720089912414551, "kl": 3.09375, "learning_rate": 1.5521144612619398e-05, "loss": 0.1103, "num_tokens": 30282445.0, "reward": -1.9072265625, "reward_std": 0.8191651105880737, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.603515625, "rewards/ppl_reward/std": 4.333240032196045, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 115.265625, "completions/mean_terminated_length": 115.265625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.9137983551629607, "grad_norm": 2.8407318592071533, "kl": 4.462890625, "learning_rate": 1.5514044074681504e-05, "loss": 0.1357, "num_tokens": 30297054.0, "reward": -0.624755859375, "reward_std": 1.2110360860824585, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.79638671875, "rewards/ppl_reward/std": 4.266637325286865, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2567298710346222, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 109.0625, "completions/mean_terminated_length": 109.0625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.9150167529698447, "grad_norm": 2.3746612071990967, "kl": 3.1015625, "learning_rate": 1.5506939539899403e-05, "loss": 0.0668, "num_tokens": 30310802.0, "reward": -1.3665771484375, "reward_std": 0.9974607825279236, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.459716796875, "rewards/ppl_reward/std": 4.89141845703125, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 130.359375, "completions/mean_terminated_length": 130.359375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.9162351507767286, "grad_norm": 2.3987114429473877, "kl": 8.15625, "learning_rate": 1.5499831013422804e-05, "loss": 0.4236, "num_tokens": 30326081.0, "reward": -1.9080810546875, "reward_std": 1.0217154026031494, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.323974609375, "rewards/ppl_reward/std": 4.569608211517334, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2121305763721466, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 147.921875, "completions/mean_terminated_length": 147.921875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.9174535485836124, "grad_norm": 5.235422134399414, "kl": 11.015625, "learning_rate": 1.549271850040431e-05, "loss": 0.696, "num_tokens": 30342756.0, "reward": -1.054443359375, "reward_std": 1.1551645994186401, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.54638671875, "rewards/ppl_reward/std": 3.7109763622283936, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21128857135772705, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 161.71875, "completions/mean_terminated_length": 148.03175354003906, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.9186719463904964, "grad_norm": 2.264104127883911, "kl": 9.482421875, "learning_rate": 1.5485602005999418e-05, "loss": 0.5964, "num_tokens": 30360394.0, "reward": -1.0504150390625, "reward_std": 0.6745349168777466, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.694580078125, "rewards/ppl_reward/std": 3.051274299621582, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2221602201461792, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 148.265625, "completions/mean_terminated_length": 148.265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.9198903441973805, "grad_norm": 3.1856226921081543, "kl": 11.505859375, "learning_rate": 1.5478481535366496e-05, "loss": 0.7378, "num_tokens": 30376555.0, "reward": -1.5369873046875, "reward_std": 1.8949456214904785, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.628662109375, "rewards/ppl_reward/std": 7.173901557922363, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 134.890625, "completions/mean_terminated_length": 134.890625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.9211087420042645, "grad_norm": 2.104255437850952, "kl": 5.5625, "learning_rate": 1.5471357093666805e-05, "loss": 0.2456, "num_tokens": 30392300.0, "reward": -1.82080078125, "reward_std": 0.90740966796875, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.2978515625, "rewards/ppl_reward/std": 3.356708526611328, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 154.703125, "completions/mean_terminated_length": 140.90476989746094, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.9223271398111483, "grad_norm": 2.431011438369751, "kl": 9.52734375, "learning_rate": 1.546422868606449e-05, "loss": 0.571, "num_tokens": 30409513.0, "reward": -2.810791015625, "reward_std": 2.922956943511963, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.12939453125, "rewards/ppl_reward/std": 11.688194274902344, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2287265807390213, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 155.328125, "completions/mean_terminated_length": 141.53968811035156, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.9235455376180322, "grad_norm": 4.333205223083496, "kl": 7.2822265625, "learning_rate": 1.5457096317726552e-05, "loss": 0.6084, "num_tokens": 30426942.0, "reward": -0.0565185546875, "reward_std": 0.5555627346038818, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.878662109375, "rewards/ppl_reward/std": 1.6858980655670166, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 134.85714721679688, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.9247639354249162, "grad_norm": 2.5187265872955322, "kl": 6.69140625, "learning_rate": 1.5449959993822884e-05, "loss": 0.4224, "num_tokens": 30443422.0, "reward": -0.6334228515625, "reward_std": 0.8061861991882324, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.829345703125, "rewards/ppl_reward/std": 3.307499885559082, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 160.19049072265625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.9259823332318002, "grad_norm": 2.5472915172576904, "kl": 10.0625, "learning_rate": 1.544281971952623e-05, "loss": 0.7203, "num_tokens": 30461674.0, "reward": -1.02740478515625, "reward_std": 1.123578429222107, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.6798095703125, "rewards/ppl_reward/std": 4.263004779815674, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.9272007310386843, "grad_norm": 2.845583915710449, "kl": 4.56640625, "learning_rate": 1.5435675500012212e-05, "loss": 0.2585, "num_tokens": 30478122.0, "reward": -1.5499267578125, "reward_std": 0.7593108415603638, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.818603515625, "rewards/ppl_reward/std": 4.381896018981934, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 143.453125, "completions/mean_terminated_length": 143.453125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.928419128845568, "grad_norm": 3.424084424972534, "kl": 4.4873046875, "learning_rate": 1.54285273404593e-05, "loss": 0.1548, "num_tokens": 30494391.0, "reward": -0.46014404296875, "reward_std": 0.49631139636039734, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.6390380859375, "rewards/ppl_reward/std": 1.8110312223434448, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 143.65625, "completions/mean_terminated_length": 143.65625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.929637526652452, "grad_norm": 1.7989598512649536, "kl": 4.169921875, "learning_rate": 1.5421375246048817e-05, "loss": 0.2165, "num_tokens": 30510161.0, "reward": -1.541748046875, "reward_std": 0.9719261527061462, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.85693359375, "rewards/ppl_reward/std": 4.423587322235107, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 144.03125, "completions/mean_terminated_length": 144.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.930855924459336, "grad_norm": 2.7768442630767822, "kl": 5.66796875, "learning_rate": 1.5414219221964954e-05, "loss": 0.3612, "num_tokens": 30525939.0, "reward": -1.366455078125, "reward_std": 0.7916396856307983, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.42822265625, "rewards/ppl_reward/std": 3.4502952098846436, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 139.484375, "completions/mean_terminated_length": 139.484375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.93207432226622, "grad_norm": 2.0982937812805176, "kl": 5.48828125, "learning_rate": 1.540705927339474e-05, "loss": 0.2819, "num_tokens": 30541754.0, "reward": -1.7623291015625, "reward_std": 0.6497159004211426, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.251220703125, "rewards/ppl_reward/std": 5.056296348571777, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 147.453125, "completions/mean_terminated_length": 133.53968811035156, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.9332927200731038, "grad_norm": 2.81240177154541, "kl": 7.4931640625, "learning_rate": 1.5399895405528043e-05, "loss": 0.4711, "num_tokens": 30558159.0, "reward": -0.858642578125, "reward_std": 1.088714599609375, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.39697265625, "rewards/ppl_reward/std": 3.308588743209839, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 141.40625, "completions/mean_terminated_length": 141.40625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.9345111178799879, "grad_norm": 2.327284336090088, "kl": 5.314453125, "learning_rate": 1.5392727623557585e-05, "loss": 0.2271, "num_tokens": 30574241.0, "reward": -3.4111328125, "reward_std": 2.5502676963806152, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.369140625, "rewards/ppl_reward/std": 13.424092292785645, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.24888142943382263, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 136.796875, "completions/mean_terminated_length": 136.796875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.9357295156868717, "grad_norm": 2.206840991973877, "kl": 4.98046875, "learning_rate": 1.538555593267892e-05, "loss": 0.2136, "num_tokens": 30589524.0, "reward": -0.694091796875, "reward_std": 0.4488915205001831, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.17724609375, "rewards/ppl_reward/std": 1.962573766708374, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 144.703125, "completions/mean_terminated_length": 144.703125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.9369479134937557, "grad_norm": 2.0555574893951416, "kl": 4.7421875, "learning_rate": 1.537838033809043e-05, "loss": 0.1941, "num_tokens": 30606369.0, "reward": -0.3470458984375, "reward_std": 0.44983261823654175, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.444091796875, "rewards/ppl_reward/std": 0.9490026831626892, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 176.546875, "completions/mean_terminated_length": 163.09524536132812, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.9381663113006398, "grad_norm": 2.846691131591797, "kl": 10.033203125, "learning_rate": 1.5371200844993332e-05, "loss": 0.8762, "num_tokens": 30624980.0, "reward": -0.635986328125, "reward_std": 0.8857396245002747, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.03759765625, "rewards/ppl_reward/std": 3.128664970397949, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 141.140625, "completions/mean_terminated_length": 141.140625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.9393847091075236, "grad_norm": 25.95339012145996, "kl": 4.3056640625, "learning_rate": 1.5364017458591668e-05, "loss": 0.1448, "num_tokens": 30640685.0, "reward": -0.9071044921875, "reward_std": 0.554387629032135, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.657958984375, "rewards/ppl_reward/std": 2.343862771987915, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 146.046875, "completions/mean_terminated_length": 146.046875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.9406031069144074, "grad_norm": 2.084617853164673, "kl": 5.15234375, "learning_rate": 1.5356830184092305e-05, "loss": 0.2687, "num_tokens": 30656784.0, "reward": -0.7037353515625, "reward_std": 0.7859290242195129, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.141845703125, "rewards/ppl_reward/std": 3.1483728885650635, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.9418215047212914, "grad_norm": 3.493312358856201, "kl": 13.71875, "learning_rate": 1.5349639026704916e-05, "loss": 0.8903, "num_tokens": 30674056.0, "reward": -1.1678466796875, "reward_std": 1.0396369695663452, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.835693359375, "rewards/ppl_reward/std": 3.1684553623199463, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 179.34375, "completions/mean_terminated_length": 179.34375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.9430399025281755, "grad_norm": 3.6370527744293213, "kl": 13.8984375, "learning_rate": 1.534244399164201e-05, "loss": 0.9655, "num_tokens": 30692398.0, "reward": -1.8377685546875, "reward_std": 1.8847970962524414, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.113037109375, "rewards/ppl_reward/std": 6.136712074279785, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.23779743909835815, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 171.015625, "completions/mean_terminated_length": 143.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.9442583003350595, "grad_norm": 3.69073224067688, "kl": 10.603515625, "learning_rate": 1.5335245084118888e-05, "loss": 0.5872, "num_tokens": 30710711.0, "reward": -0.3065185546875, "reward_std": 0.8606666326522827, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.245849609375, "rewards/ppl_reward/std": 1.9528586864471436, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 134.78125, "completions/mean_terminated_length": 134.78125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.9454766981419434, "grad_norm": 2.3884341716766357, "kl": 3.78125, "learning_rate": 1.5328042309353655e-05, "loss": 0.0827, "num_tokens": 30726585.0, "reward": -1.48828125, "reward_std": 0.78727126121521, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.578125, "rewards/ppl_reward/std": 4.8006815910339355, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 131.453125, "completions/mean_terminated_length": 131.453125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.9466950959488272, "grad_norm": 3.6214940547943115, "kl": 3.8046875, "learning_rate": 1.532083567256725e-05, "loss": 0.1222, "num_tokens": 30741798.0, "reward": -1.9327392578125, "reward_std": 0.8339205980300903, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.576416015625, "rewards/ppl_reward/std": 5.482361316680908, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 149.046875, "completions/mean_terminated_length": 135.1587371826172, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.9479134937557112, "grad_norm": 3.6858391761779785, "kl": 5.5029296875, "learning_rate": 1.5313625178983368e-05, "loss": 0.4265, "num_tokens": 30757985.0, "reward": -1.285400390625, "reward_std": 1.2201671600341797, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.31298828125, "rewards/ppl_reward/std": 4.535982131958008, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 124.234375, "completions/mean_terminated_length": 124.234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.9491318915625953, "grad_norm": 2.1205036640167236, "kl": 2.96875, "learning_rate": 1.5306410833828534e-05, "loss": 0.181, "num_tokens": 30772496.0, "reward": -1.603759765625, "reward_std": 0.6043953895568848, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.05908203125, "rewards/ppl_reward/std": 4.638021469116211, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 135.48387145996094, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 1.9503502893694793, "grad_norm": 4.796030044555664, "kl": 9.796875, "learning_rate": 1.529919264233205e-05, "loss": 0.7549, "num_tokens": 30789848.0, "reward": -0.83953857421875, "reward_std": 2.074230670928955, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.2806396484375, "rewards/ppl_reward/std": 7.864829063415527, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23854589462280273, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 128.34375, "completions/mean_terminated_length": 128.34375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 1.9515686871763631, "grad_norm": 2.0755724906921387, "kl": 2.58984375, "learning_rate": 1.5291970609726008e-05, "loss": 0.0958, "num_tokens": 30804838.0, "reward": -1.0286865234375, "reward_std": 0.6211654543876648, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.846435546875, "rewards/ppl_reward/std": 3.2510199546813965, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 136.109375, "completions/mean_terminated_length": 136.109375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.952787084983247, "grad_norm": 1.6927855014801025, "kl": 5.8837890625, "learning_rate": 1.5284744741245285e-05, "loss": 0.3296, "num_tokens": 30820325.0, "reward": -0.2646484375, "reward_std": 0.5330568552017212, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.232421875, "rewards/ppl_reward/std": 0.9669061303138733, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 125.96875, "completions/mean_terminated_length": 125.96875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.954005482790131, "grad_norm": 1.7875570058822632, "kl": 2.462890625, "learning_rate": 1.5277515042127528e-05, "loss": 0.0887, "num_tokens": 30835403.0, "reward": -0.9122314453125, "reward_std": 0.4399467408657074, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.597900390625, "rewards/ppl_reward/std": 2.749786138534546, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 133.359375, "completions/mean_terminated_length": 133.359375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 1.955223880597015, "grad_norm": 2.606275796890259, "kl": 6.98828125, "learning_rate": 1.5270281517613184e-05, "loss": 0.3076, "num_tokens": 30851002.0, "reward": -2.78594970703125, "reward_std": 0.8762016296386719, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.1109619140625, "rewards/ppl_reward/std": 8.973809242248535, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19760315120220184, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 130.046875, "completions/mean_terminated_length": 130.046875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.9564422784038988, "grad_norm": 1.8309403657913208, "kl": 2.826171875, "learning_rate": 1.5263044172945456e-05, "loss": 0.088, "num_tokens": 30866325.0, "reward": -0.6119384765625, "reward_std": 0.4792807698249817, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.020751953125, "rewards/ppl_reward/std": 2.0700526237487793, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 138.34375, "completions/mean_terminated_length": 138.34375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.9576606762107827, "grad_norm": 2.4534404277801514, "kl": 6.15625, "learning_rate": 1.5255803013370314e-05, "loss": 0.2529, "num_tokens": 30881979.0, "reward": -1.08935546875, "reward_std": 0.9615140557289124, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.8349609375, "rewards/ppl_reward/std": 2.761817216873169, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 147.6875, "completions/mean_terminated_length": 147.6875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.9588790740176667, "grad_norm": 1.8896958827972412, "kl": 7.87890625, "learning_rate": 1.5248558044136502e-05, "loss": 0.5116, "num_tokens": 30898079.0, "reward": -2.192138671875, "reward_std": 1.2198582887649536, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.14990234375, "rewards/ppl_reward/std": 7.51807975769043, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 140.00001525878906, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.9600974718245507, "grad_norm": 7.221668720245361, "kl": 11.546875, "learning_rate": 1.5241309270495524e-05, "loss": 0.5223, "num_tokens": 30914643.0, "reward": -1.196044921875, "reward_std": 1.2994040250778198, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.94677734375, "rewards/ppl_reward/std": 4.381568431854248, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16810208559036255, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 153.40625, "completions/mean_terminated_length": 153.40625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.9613158696314348, "grad_norm": 1.5737730264663696, "kl": 4.474609375, "learning_rate": 1.5234056697701633e-05, "loss": 0.2213, "num_tokens": 30932269.0, "reward": -1.871337890625, "reward_std": 0.528454065322876, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.43798828125, "rewards/ppl_reward/std": 3.8706724643707275, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 139.828125, "completions/mean_terminated_length": 139.828125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.9625342674383186, "grad_norm": 2.1419949531555176, "kl": 2.4072265625, "learning_rate": 1.5226800331011853e-05, "loss": 0.0601, "num_tokens": 30948514.0, "reward": -1.7178955078125, "reward_std": 0.7016239166259766, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.240478515625, "rewards/ppl_reward/std": 4.571784019470215, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 156.078125, "completions/mean_terminated_length": 142.3015899658203, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.9637526652452024, "grad_norm": 22.63151741027832, "kl": 7.646484375, "learning_rate": 1.5219540175685938e-05, "loss": 0.3937, "num_tokens": 30966239.0, "reward": -0.44921875, "reward_std": 0.41793370246887207, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.6171875, "rewards/ppl_reward/std": 1.9632269144058228, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 144.953125, "completions/mean_terminated_length": 144.953125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.9649710630520865, "grad_norm": 3.2395009994506836, "kl": 9.07421875, "learning_rate": 1.5212276236986401e-05, "loss": 0.4848, "num_tokens": 30982524.0, "reward": -1.622802734375, "reward_std": 0.9592739939689636, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.72998046875, "rewards/ppl_reward/std": 2.35302472114563, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2366211861371994, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 155.328125, "completions/mean_terminated_length": 141.53968811035156, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.9661894608589705, "grad_norm": 1.8405537605285645, "kl": 8.0859375, "learning_rate": 1.520500852017849e-05, "loss": 0.4999, "num_tokens": 31000137.0, "reward": -0.5316162109375, "reward_std": 0.6994404792785645, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.735107421875, "rewards/ppl_reward/std": 3.4621925354003906, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 164.671875, "completions/mean_terminated_length": 136.9516143798828, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.9674078586658545, "grad_norm": 2.478459358215332, "kl": 10.87890625, "learning_rate": 1.5197737030530205e-05, "loss": 0.7548, "num_tokens": 31016820.0, "reward": -1.79144287109375, "reward_std": 1.0201095342636108, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.1688232421875, "rewards/ppl_reward/std": 5.147156715393066, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 145.703125, "completions/mean_terminated_length": 145.703125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 1.9686262564727384, "grad_norm": 3.209594964981079, "kl": 11.109375, "learning_rate": 1.5190461773312261e-05, "loss": 0.5999, "num_tokens": 31033193.0, "reward": -1.2646484375, "reward_std": 1.443204641342163, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.912109375, "rewards/ppl_reward/std": 3.424295425415039, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2431795299053192, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 121.21875, "completions/mean_terminated_length": 121.21875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.9698446542796222, "grad_norm": 2.6742846965789795, "kl": 4.71875, "learning_rate": 1.5183182753798123e-05, "loss": 0.1659, "num_tokens": 31047559.0, "reward": -1.2149658203125, "reward_std": 0.761581301689148, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.125244140625, "rewards/ppl_reward/std": 3.420703887939453, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 140.859375, "completions/mean_terminated_length": 140.859375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.9710630520865062, "grad_norm": 2.1645402908325195, "kl": 4.7177734375, "learning_rate": 1.5175899977263963e-05, "loss": 0.2553, "num_tokens": 31064262.0, "reward": -1.40576171875, "reward_std": 0.465236097574234, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.5927734375, "rewards/ppl_reward/std": 5.108100414276123, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 129.984375, "completions/mean_terminated_length": 129.984375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.9722814498933903, "grad_norm": 2.582463502883911, "kl": 4.8828125, "learning_rate": 1.51686134489887e-05, "loss": 0.1831, "num_tokens": 31079389.0, "reward": -1.3988037109375, "reward_std": 1.164079189300537, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.625732421875, "rewards/ppl_reward/std": 6.417712211608887, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 126.359375, "completions/mean_terminated_length": 126.359375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 1.973499847700274, "grad_norm": 3.7009572982788086, "kl": 4.37109375, "learning_rate": 1.5161323174253957e-05, "loss": 0.0828, "num_tokens": 31093996.0, "reward": -6.8609619140625, "reward_std": 11.853759765625, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -17.409423828125, "rewards/ppl_reward/std": 66.41004943847656, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.208927720785141, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 133.484375, "completions/mean_terminated_length": 133.484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.9747182455071581, "grad_norm": 1.683333158493042, "kl": 3.802734375, "learning_rate": 1.5154029158344065e-05, "loss": 0.0917, "num_tokens": 31110227.0, "reward": -0.20849609375, "reward_std": 0.7104812264442444, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.1201171875, "rewards/ppl_reward/std": 1.2332580089569092, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 149.421875, "completions/mean_terminated_length": 149.421875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.975936643314042, "grad_norm": 1.987034797668457, "kl": 6.2734375, "learning_rate": 1.514673140654609e-05, "loss": 0.3369, "num_tokens": 31127582.0, "reward": -0.6514892578125, "reward_std": 0.6681226491928101, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.920166015625, "rewards/ppl_reward/std": 2.6663646697998047, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 136.453125, "completions/mean_terminated_length": 136.453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.977155041120926, "grad_norm": 1.446904182434082, "kl": 3.291015625, "learning_rate": 1.5139429924149788e-05, "loss": 0.0753, "num_tokens": 31143835.0, "reward": -1.224365234375, "reward_std": 0.5229694843292236, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.15185546875, "rewards/ppl_reward/std": 2.3420372009277344, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 137.140625, "completions/mean_terminated_length": 137.140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.97837343892781, "grad_norm": 1.9531718492507935, "kl": 7.14453125, "learning_rate": 1.5132124716447627e-05, "loss": 0.3387, "num_tokens": 31159700.0, "reward": -1.55029296875, "reward_std": 1.2273286581039429, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.6318359375, "rewards/ppl_reward/std": 4.435964584350586, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23517554998397827, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.9795918367346939, "grad_norm": 1.525835633277893, "kl": 3.587890625, "learning_rate": 1.5124815788734769e-05, "loss": 0.0848, "num_tokens": 31174568.0, "reward": 0.0360107421875, "reward_std": 0.6256375312805176, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.646728515625, "rewards/ppl_reward/std": 1.936026692390442, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 141.15625, "completions/mean_terminated_length": 127.14286804199219, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.9808102345415777, "grad_norm": 4.044806957244873, "kl": 4.8828125, "learning_rate": 1.5117503146309075e-05, "loss": 0.3823, "num_tokens": 31190914.0, "reward": -0.542724609375, "reward_std": 0.4164956212043762, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.81982421875, "rewards/ppl_reward/std": 1.8939098119735718, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 143.546875, "completions/mean_terminated_length": 143.546875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.9820286323484617, "grad_norm": 1.9701811075210571, "kl": 7.23828125, "learning_rate": 1.5110186794471105e-05, "loss": 0.3629, "num_tokens": 31207165.0, "reward": -1.28466796875, "reward_std": 0.6964172124862671, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.1474609375, "rewards/ppl_reward/std": 4.096346855163574, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14412261545658112, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 139.953125, "completions/mean_terminated_length": 139.953125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.9832470301553458, "grad_norm": 5.423399925231934, "kl": 5.595703125, "learning_rate": 1.5102866738524101e-05, "loss": 0.35, "num_tokens": 31222658.0, "reward": -2.3453369140625, "reward_std": 0.794069230556488, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.464111328125, "rewards/ppl_reward/std": 5.931544780731201, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 130.9375, "completions/mean_terminated_length": 130.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.9844654279622298, "grad_norm": 2.313148260116577, "kl": 5.68359375, "learning_rate": 1.509554298377399e-05, "loss": 0.2507, "num_tokens": 31238214.0, "reward": -2.450927734375, "reward_std": 1.3273849487304688, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.54248046875, "rewards/ppl_reward/std": 4.886828422546387, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 130.78125, "completions/mean_terminated_length": 130.78125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.9856838257691136, "grad_norm": 2.9261655807495117, "kl": 4.8408203125, "learning_rate": 1.508821553552938e-05, "loss": 0.1283, "num_tokens": 31253104.0, "reward": -17.88079833984375, "reward_std": 18.23189353942871, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -39.2772216796875, "rewards/ppl_reward/std": 120.51515197753906, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 131.640625, "completions/mean_terminated_length": 131.640625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.9869022235759974, "grad_norm": 3.4995005130767822, "kl": 6.6591796875, "learning_rate": 1.5080884399101562e-05, "loss": 0.2106, "num_tokens": 31268385.0, "reward": -1.73309326171875, "reward_std": 0.9202439785003662, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.9271240234375, "rewards/ppl_reward/std": 5.111945629119873, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.9881206213828815, "grad_norm": 3.171689748764038, "kl": 4.53515625, "learning_rate": 1.5073549579804493e-05, "loss": 0.2619, "num_tokens": 31284793.0, "reward": -1.0068359375, "reward_std": 0.6819736957550049, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.740234375, "rewards/ppl_reward/std": 4.070128440856934, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 152.59375, "completions/mean_terminated_length": 152.59375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.9893390191897655, "grad_norm": 4.366892337799072, "kl": 8.1796875, "learning_rate": 1.5066211082954802e-05, "loss": 0.6008, "num_tokens": 31301503.0, "reward": -1.734375, "reward_std": 1.2788209915161133, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.0859375, "rewards/ppl_reward/std": 4.209129810333252, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 144.90625, "completions/mean_terminated_length": 144.90625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.9905574169966496, "grad_norm": 2.8281426429748535, "kl": 4.896484375, "learning_rate": 1.5058868913871787e-05, "loss": 0.3385, "num_tokens": 31317489.0, "reward": -0.69049072265625, "reward_std": 0.644839882850647, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.1387939453125, "rewards/ppl_reward/std": 2.5463271141052246, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.9917758148035334, "grad_norm": 2.158048152923584, "kl": 9.3828125, "learning_rate": 1.5051523077877403e-05, "loss": 0.5341, "num_tokens": 31334253.0, "reward": -0.9085693359375, "reward_std": 0.8240280151367188, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.403076171875, "rewards/ppl_reward/std": 4.0791544914245605, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 143.359375, "completions/mean_terminated_length": 143.359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.9929942126104172, "grad_norm": 1.5781960487365723, "kl": 4.771484375, "learning_rate": 1.5044173580296267e-05, "loss": 0.1734, "num_tokens": 31351164.0, "reward": -1.091796875, "reward_std": 0.8088585138320923, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.81640625, "rewards/ppl_reward/std": 3.643747329711914, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 176.1875, "completions/mean_terminated_length": 148.8386993408203, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.9942126104173012, "grad_norm": 3.262951374053955, "kl": 14.2265625, "learning_rate": 1.5036820426455645e-05, "loss": 0.9582, "num_tokens": 31369776.0, "reward": -0.134521484375, "reward_std": 0.7965102791786194, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -3.69091796875, "rewards/ppl_reward/std": 0.9246326088905334, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19527530670166016, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 142.3125, "completions/mean_terminated_length": 142.3125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.9954310082241853, "grad_norm": 3.331561326980591, "kl": 8.072265625, "learning_rate": 1.502946362168546e-05, "loss": 0.3584, "num_tokens": 31385484.0, "reward": -1.853515625, "reward_std": 0.9951585531234741, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.23046875, "rewards/ppl_reward/std": 5.650798320770264, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 165.578125, "completions/mean_terminated_length": 165.578125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.996649406031069, "grad_norm": 2.437251329421997, "kl": 13.09375, "learning_rate": 1.5022103171318276e-05, "loss": 0.8623, "num_tokens": 31403193.0, "reward": -1.2181396484375, "reward_std": 1.2624881267547607, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.826904296875, "rewards/ppl_reward/std": 3.097360849380493, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21921011805534363, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 161.21875, "completions/mean_terminated_length": 161.21875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.997867803837953, "grad_norm": 3.458012819290161, "kl": 12.07421875, "learning_rate": 1.5014739080689301e-05, "loss": 0.6279, "num_tokens": 31420895.0, "reward": -1.2203369140625, "reward_std": 1.3665430545806885, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.823486328125, "rewards/ppl_reward/std": 4.520509243011475, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.20590098202228546, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 132.7027130126953, "completions/mean_terminated_length": 132.7027130126953, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.999086201644837, "grad_norm": 1.671883225440979, "kl": 5.9921875, "learning_rate": 1.5007371355136382e-05, "loss": 0.2589, "num_tokens": 31435847.0, "reward": -0.7164306640625, "reward_std": 1.3844926357269287, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.010986328125, "rewards/ppl_reward/std": 3.673177480697632, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.001218397806884, "grad_norm": 2.044130802154541, "kl": 5.4140625, "learning_rate": 1.5000000000000002e-05, "loss": 0.1913, "num_tokens": 31452235.0, "reward": -1.46142578125, "reward_std": 1.2758122682571411, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.5087890625, "rewards/ppl_reward/std": 6.148505687713623, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 140.828125, "completions/mean_terminated_length": 140.828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.002436795613768, "grad_norm": 1.5913312435150146, "kl": 5.5927734375, "learning_rate": 1.499262502062327e-05, "loss": 0.2415, "num_tokens": 31468912.0, "reward": -1.735595703125, "reward_std": 0.7913849353790283, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.94775390625, "rewards/ppl_reward/std": 6.330013275146484, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 137.59375, "completions/mean_terminated_length": 123.52381896972656, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.0036551934206517, "grad_norm": 2.406425952911377, "kl": 5.19140625, "learning_rate": 1.4985246422351931e-05, "loss": 0.322, "num_tokens": 31483846.0, "reward": -1.552490234375, "reward_std": 1.6791138648986816, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.73779296875, "rewards/ppl_reward/std": 6.1356682777404785, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23854589462280273, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 147.328125, "completions/mean_terminated_length": 147.328125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.0048735912275357, "grad_norm": 2.7002017498016357, "kl": 3.5, "learning_rate": 1.4977864210534341e-05, "loss": 0.2062, "num_tokens": 31501043.0, "reward": -1.22802734375, "reward_std": 0.738784670829773, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.1435546875, "rewards/ppl_reward/std": 3.1643736362457275, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 133.578125, "completions/mean_terminated_length": 133.578125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.0060919890344198, "grad_norm": 2.9043517112731934, "kl": 4.771484375, "learning_rate": 1.4970478390521491e-05, "loss": 0.2193, "num_tokens": 31516512.0, "reward": 0.123291015625, "reward_std": 0.3655667304992676, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.50341796875, "rewards/ppl_reward/std": 0.6715517640113831, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 129.953125, "completions/mean_terminated_length": 129.953125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.007310386841304, "grad_norm": 1.8738349676132202, "kl": 2.93359375, "learning_rate": 1.496308896766697e-05, "loss": 0.0803, "num_tokens": 31531429.0, "reward": -0.4019775390625, "reward_std": 0.705520510673523, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.514892578125, "rewards/ppl_reward/std": 1.8288836479187012, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 133.421875, "completions/mean_terminated_length": 133.421875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.008528784648188, "grad_norm": 8.382716178894043, "kl": 6.86328125, "learning_rate": 1.4955695947326987e-05, "loss": 0.3059, "num_tokens": 31546824.0, "reward": -2.03271484375, "reward_std": 0.7419736385345459, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.7138671875, "rewards/ppl_reward/std": 4.499762058258057, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 148.609375, "completions/mean_terminated_length": 148.609375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.0097471824550714, "grad_norm": 3.04420804977417, "kl": 7.23828125, "learning_rate": 1.4948299334860364e-05, "loss": 0.3727, "num_tokens": 31563735.0, "reward": -3.4586181640625, "reward_std": 2.1534578800201416, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.432861328125, "rewards/ppl_reward/std": 10.098666191101074, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2735668122768402, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.0109655802619555, "grad_norm": 2.1857900619506836, "kl": 2.666015625, "learning_rate": 1.494089913562852e-05, "loss": 0.0382, "num_tokens": 31579035.0, "reward": -3.00830078125, "reward_std": 0.940266489982605, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.6962890625, "rewards/ppl_reward/std": 13.742242813110352, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 135.84375, "completions/mean_terminated_length": 135.84375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.0121839780688395, "grad_norm": 1.4325801134109497, "kl": 3.7861328125, "learning_rate": 1.4933495354995474e-05, "loss": 0.1216, "num_tokens": 31594801.0, "reward": -2.61376953125, "reward_std": 0.8267296552658081, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.8916015625, "rewards/ppl_reward/std": 12.009970664978027, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 130.640625, "completions/mean_terminated_length": 130.640625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.0134023758757236, "grad_norm": 2.2819814682006836, "kl": 6.4921875, "learning_rate": 1.4926087998327838e-05, "loss": 0.2701, "num_tokens": 31610258.0, "reward": -2.6280517578125, "reward_std": 0.8044776916503906, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.857666015625, "rewards/ppl_reward/std": 6.1941704750061035, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 136.953125, "completions/mean_terminated_length": 136.953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.014620773682607, "grad_norm": 2.8896684646606445, "kl": 6.9560546875, "learning_rate": 1.4918677070994828e-05, "loss": 0.305, "num_tokens": 31626031.0, "reward": -1.126953125, "reward_std": 0.6322118043899536, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.85546875, "rewards/ppl_reward/std": 3.9765257835388184, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 121.515625, "completions/mean_terminated_length": 121.515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.015839171489491, "grad_norm": 3.715240478515625, "kl": 6.44921875, "learning_rate": 1.4911262578368233e-05, "loss": 0.2666, "num_tokens": 31640064.0, "reward": -3.5802001953125, "reward_std": 1.1660594940185547, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -10.566650390625, "rewards/ppl_reward/std": 7.753436088562012, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.233588308095932, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.0170575692963753, "grad_norm": 1.6711995601654053, "kl": 4.890625, "learning_rate": 1.490384452582244e-05, "loss": 0.1983, "num_tokens": 31656256.0, "reward": -1.28271484375, "reward_std": 0.5450683236122131, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.0966796875, "rewards/ppl_reward/std": 2.6833972930908203, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14773420989513397, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 123.484375, "completions/mean_terminated_length": 123.484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.0182759671032593, "grad_norm": 3.1192002296447754, "kl": 6.97265625, "learning_rate": 1.4896422918734405e-05, "loss": 0.2472, "num_tokens": 31670631.0, "reward": -3.4241943359375, "reward_std": 1.6545639038085938, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -10.137451171875, "rewards/ppl_reward/std": 9.130705833435059, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2106640487909317, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 138.640625, "completions/mean_terminated_length": 138.640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.0194943649101433, "grad_norm": 3.818225622177124, "kl": 7.578125, "learning_rate": 1.488899776248367e-05, "loss": 0.3741, "num_tokens": 31685832.0, "reward": -7.1319580078125, "reward_std": 12.73863410949707, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -17.490478515625, "rewards/ppl_reward/std": 66.25212860107422, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.23566938936710358, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.020712762717027, "grad_norm": 2.143571376800537, "kl": 5.791015625, "learning_rate": 1.488156906245234e-05, "loss": 0.3059, "num_tokens": 31700884.0, "reward": -2.279541015625, "reward_std": 1.686943769454956, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -7.62939453125, "rewards/ppl_reward/std": 6.534449577331543, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.27680572867393494, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 135.359375, "completions/mean_terminated_length": 135.359375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 2.021931160523911, "grad_norm": 2.0668420791625977, "kl": 4.171875, "learning_rate": 1.4874136824025102e-05, "loss": 0.1061, "num_tokens": 31716747.0, "reward": -0.6357421875, "reward_std": 0.7312569618225098, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -4.521484375, "rewards/ppl_reward/std": 1.8391444683074951, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19669894874095917, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 142.140625, "completions/mean_terminated_length": 142.140625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.023149558330795, "grad_norm": 2.1289422512054443, "kl": 3.96875, "learning_rate": 1.4866701052589195e-05, "loss": 0.2507, "num_tokens": 31732372.0, "reward": -1.7666015625, "reward_std": 1.3565104007720947, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.041015625, "rewards/ppl_reward/std": 6.60418701171875, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 135.453125, "completions/mean_terminated_length": 135.453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.024367956137679, "grad_norm": 3.3787968158721924, "kl": 6.1875, "learning_rate": 1.4859261753534417e-05, "loss": 0.279, "num_tokens": 31748409.0, "reward": -1.626953125, "reward_std": 1.0066328048706055, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -6.46484375, "rewards/ppl_reward/std": 3.8849759101867676, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2672322392463684, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.025586353944563, "grad_norm": 1.9172918796539307, "kl": 5.171875, "learning_rate": 1.4851818932253137e-05, "loss": 0.0851, "num_tokens": 31763777.0, "reward": -1.239990234375, "reward_std": 1.5755072832107544, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.65966796875, "rewards/ppl_reward/std": 3.152522087097168, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.3014669418334961, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.0268047517514467, "grad_norm": 2.3263301849365234, "kl": 4.19140625, "learning_rate": 1.4844372594140271e-05, "loss": 0.1067, "num_tokens": 31779037.0, "reward": -2.2265625, "reward_std": 0.7901493906974792, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -7.734375, "rewards/ppl_reward/std": 6.356956481933594, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.18633900582790375, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 141.109375, "completions/mean_terminated_length": 141.109375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.0280231495583307, "grad_norm": 1.8277945518493652, "kl": 2.9453125, "learning_rate": 1.4836922744593281e-05, "loss": 0.1148, "num_tokens": 31795820.0, "reward": -3.38067626953125, "reward_std": 2.9682862758636475, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.5113525390625, "rewards/ppl_reward/std": 19.413541793823242, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1510545015335083, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 129.53125, "completions/mean_terminated_length": 129.53125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.029241547365215, "grad_norm": 2.097201108932495, "kl": 5.984375, "learning_rate": 1.4829469389012173e-05, "loss": 0.2991, "num_tokens": 31810662.0, "reward": -0.8306884765625, "reward_std": 0.9809654355049133, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.005126953125, "rewards/ppl_reward/std": 3.7674477100372314, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.030459945172099, "grad_norm": 2.5058252811431885, "kl": 3.88671875, "learning_rate": 1.48220125327995e-05, "loss": 0.1158, "num_tokens": 31824890.0, "reward": -2.7987060546875, "reward_std": 0.9191290736198425, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.136474609375, "rewards/ppl_reward/std": 9.968810081481934, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 140.859375, "completions/mean_terminated_length": 140.859375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.0316783429789824, "grad_norm": 3.6549785137176514, "kl": 2.6142578125, "learning_rate": 1.4814552181360346e-05, "loss": 0.0483, "num_tokens": 31841609.0, "reward": -1.558349609375, "reward_std": 0.39634788036346436, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.87451171875, "rewards/ppl_reward/std": 3.7419722080230713, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 151.03125, "completions/mean_terminated_length": 151.03125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.0328967407858665, "grad_norm": 5.3821868896484375, "kl": 5.1171875, "learning_rate": 1.4807088340102342e-05, "loss": 0.3001, "num_tokens": 31859019.0, "reward": -1.6126708984375, "reward_std": 0.5185451507568359, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.850341796875, "rewards/ppl_reward/std": 5.63370418548584, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0833333358168602, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 149.015625, "completions/mean_terminated_length": 149.015625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.0341151385927505, "grad_norm": 6.001331806182861, "kl": 4.70703125, "learning_rate": 1.4799621014435627e-05, "loss": 0.1933, "num_tokens": 31875956.0, "reward": -0.88232421875, "reward_std": 0.9651365876197815, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.2802734375, "rewards/ppl_reward/std": 3.692610263824463, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 138.421875, "completions/mean_terminated_length": 138.421875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.0353335363996345, "grad_norm": 2.1170387268066406, "kl": 4.640625, "learning_rate": 1.4792150209772888e-05, "loss": 0.2434, "num_tokens": 31892007.0, "reward": -1.3558349609375, "reward_std": 0.7127922773361206, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.352294921875, "rewards/ppl_reward/std": 1.9166786670684814, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 145.078125, "completions/mean_terminated_length": 145.078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.0365519342065186, "grad_norm": 1.5203943252563477, "kl": 5.400390625, "learning_rate": 1.4784675931529316e-05, "loss": 0.3011, "num_tokens": 31908236.0, "reward": -1.3306884765625, "reward_std": 1.0989832878112793, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.192626953125, "rewards/ppl_reward/std": 4.400771617889404, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.037770332013402, "grad_norm": 2.3685712814331055, "kl": 6.6875, "learning_rate": 1.477719818512263e-05, "loss": 0.3342, "num_tokens": 31923812.0, "reward": -0.98236083984375, "reward_std": 0.920430064201355, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.4022216796875, "rewards/ppl_reward/std": 2.636850118637085, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 136.515625, "completions/mean_terminated_length": 136.515625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.0389887298202862, "grad_norm": 1.6839808225631714, "kl": 5.734375, "learning_rate": 1.4769716975973063e-05, "loss": 0.3092, "num_tokens": 31939565.0, "reward": -0.87841796875, "reward_std": 0.9099481105804443, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.3115234375, "rewards/ppl_reward/std": 2.9106132984161377, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 149.5625, "completions/mean_terminated_length": 149.5625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.0402071276271703, "grad_norm": 1.7899447679519653, "kl": 5.1416015625, "learning_rate": 1.4762232309503349e-05, "loss": 0.3569, "num_tokens": 31956417.0, "reward": -1.0291748046875, "reward_std": 0.5582789778709412, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.863037109375, "rewards/ppl_reward/std": 3.322777032852173, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 138.828125, "completions/mean_terminated_length": 138.828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.0414255254340543, "grad_norm": 3.709301233291626, "kl": 8.01953125, "learning_rate": 1.4754744191138734e-05, "loss": 0.375, "num_tokens": 31971942.0, "reward": -0.67828369140625, "reward_std": 0.9311918020248413, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.8878173828125, "rewards/ppl_reward/std": 2.6947124004364014, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 125.328125, "completions/mean_terminated_length": 125.328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.0426439232409384, "grad_norm": 3.640941619873047, "kl": 5.513671875, "learning_rate": 1.4747252626306973e-05, "loss": 0.2005, "num_tokens": 31986595.0, "reward": -1.0361328125, "reward_std": 0.7216653227806091, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.798828125, "rewards/ppl_reward/std": 2.579253673553467, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.043862321047822, "grad_norm": 4.20949649810791, "kl": 10.0859375, "learning_rate": 1.4739757620438308e-05, "loss": 0.5206, "num_tokens": 32002667.0, "reward": -0.4110107421875, "reward_std": 0.7474299669265747, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.525146484375, "rewards/ppl_reward/std": 1.6891976594924927, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 155.890625, "completions/mean_terminated_length": 155.890625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.045080718854706, "grad_norm": 2.3980188369750977, "kl": 8.67578125, "learning_rate": 1.473225917896548e-05, "loss": 0.5329, "num_tokens": 32019812.0, "reward": -0.5174560546875, "reward_std": 0.8630033731460571, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.722412109375, "rewards/ppl_reward/std": 1.7246601581573486, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.04629911666159, "grad_norm": 2.9635210037231445, "kl": 11.3046875, "learning_rate": 1.4724757307323718e-05, "loss": 0.6856, "num_tokens": 32036816.0, "reward": -1.144287109375, "reward_std": 0.8199037313461304, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.94482421875, "rewards/ppl_reward/std": 2.2485527992248535, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 134.09375, "completions/mean_terminated_length": 134.09375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.047517514468474, "grad_norm": 1.968400001525879, "kl": 8.1484375, "learning_rate": 1.4717252010950743e-05, "loss": 0.4281, "num_tokens": 32051766.0, "reward": -1.70068359375, "reward_std": 1.5606327056884766, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.9716796875, "rewards/ppl_reward/std": 6.108781337738037, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 165.578125, "completions/mean_terminated_length": 151.952392578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.048735912275358, "grad_norm": 3.415476083755493, "kl": 11.84375, "learning_rate": 1.4709743295286751e-05, "loss": 0.6616, "num_tokens": 32069435.0, "reward": -0.939453125, "reward_std": 1.1231098175048828, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.46484375, "rewards/ppl_reward/std": 3.375401735305786, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13495801389217377, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 151.859375, "completions/mean_terminated_length": 151.859375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.0499543100822417, "grad_norm": 2.1574952602386475, "kl": 7.7119140625, "learning_rate": 1.4702231165774423e-05, "loss": 0.4885, "num_tokens": 32086330.0, "reward": -1.9781494140625, "reward_std": 1.4653130769729614, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.557861328125, "rewards/ppl_reward/std": 5.633084297180176, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 153.78125, "completions/mean_terminated_length": 153.78125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.0511727078891258, "grad_norm": 3.1352968215942383, "kl": 10.4296875, "learning_rate": 1.469471562785891e-05, "loss": 0.5205, "num_tokens": 32103692.0, "reward": -0.9818115234375, "reward_std": 0.5690417289733887, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.440185546875, "rewards/ppl_reward/std": 2.1144120693206787, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.104981929063797, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.05239110569601, "grad_norm": 2.888432264328003, "kl": 2.669921875, "learning_rate": 1.4687196686987832e-05, "loss": 0.0909, "num_tokens": 32119308.0, "reward": -2.3468017578125, "reward_std": 1.3169524669647217, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.498291015625, "rewards/ppl_reward/std": 5.507279396057129, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 130.015625, "completions/mean_terminated_length": 130.015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.053609503502894, "grad_norm": 2.2802629470825195, "kl": 3.78515625, "learning_rate": 1.4679674348611288e-05, "loss": 0.1616, "num_tokens": 32134789.0, "reward": -1.39892578125, "reward_std": 0.6388722658157349, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.6103515625, "rewards/ppl_reward/std": 3.3053793907165527, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 145.203125, "completions/mean_terminated_length": 145.203125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.0548279013097774, "grad_norm": 2.1128947734832764, "kl": 3.9365234375, "learning_rate": 1.467214861818182e-05, "loss": 0.189, "num_tokens": 32150954.0, "reward": -1.51226806640625, "reward_std": 0.4415927231311798, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.8682861328125, "rewards/ppl_reward/std": 3.526700258255005, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 141.671875, "completions/mean_terminated_length": 141.671875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.0560462991166615, "grad_norm": 3.8649799823760986, "kl": 4.625, "learning_rate": 1.4664619501154445e-05, "loss": 0.4391, "num_tokens": 32166565.0, "reward": -1.401611328125, "reward_std": 0.5376484394073486, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.70947265625, "rewards/ppl_reward/std": 6.778341770172119, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 156.984375, "completions/mean_terminated_length": 143.22222900390625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.0572646969235455, "grad_norm": 2.6872732639312744, "kl": 5.08203125, "learning_rate": 1.4657087002986628e-05, "loss": 0.3425, "num_tokens": 32183588.0, "reward": -1.555908203125, "reward_std": 0.4227454960346222, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.86181640625, "rewards/ppl_reward/std": 2.8789618015289307, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.0584830947304296, "grad_norm": 4.027123928070068, "kl": 7.16796875, "learning_rate": 1.4649551129138284e-05, "loss": 0.5844, "num_tokens": 32199908.0, "reward": -0.6988525390625, "reward_std": 1.0465904474258423, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.038330078125, "rewards/ppl_reward/std": 4.071534633636475, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.0597014925373136, "grad_norm": 2.7578272819519043, "kl": 3.224609375, "learning_rate": 1.464201188507178e-05, "loss": 0.1872, "num_tokens": 32216068.0, "reward": -0.7557373046875, "reward_std": 0.40027064085006714, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.378662109375, "rewards/ppl_reward/std": 1.572906255722046, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 177.84375, "completions/mean_terminated_length": 150.5483856201172, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.060919890344197, "grad_norm": 2.7743449211120605, "kl": 10.421875, "learning_rate": 1.4634469276251919e-05, "loss": 0.7561, "num_tokens": 32234362.0, "reward": -2.6324462890625, "reward_std": 0.8784276247024536, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.850830078125, "rewards/ppl_reward/std": 3.8097007274627686, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13495801389217377, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 174.078125, "completions/mean_terminated_length": 160.58731079101562, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.0621382881510812, "grad_norm": 2.0811641216278076, "kl": 7.53125, "learning_rate": 1.4626923308145948e-05, "loss": 0.519, "num_tokens": 32252695.0, "reward": -1.774658203125, "reward_std": 0.5841867923736572, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.27587890625, "rewards/ppl_reward/std": 2.18880033493042, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 205.453125, "completions/mean_terminated_length": 179.0483856201172, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.0633566859579653, "grad_norm": 7.785551071166992, "kl": 16.8125, "learning_rate": 1.4619373986223548e-05, "loss": 1.0549, "num_tokens": 32272860.0, "reward": -1.07257080078125, "reward_std": 0.565564751625061, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.8326416015625, "rewards/ppl_reward/std": 3.116334915161133, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.0645750837648493, "grad_norm": 2.56260347366333, "kl": 7.125, "learning_rate": 1.4611821315956829e-05, "loss": 0.3614, "num_tokens": 32290444.0, "reward": -1.6492919921875, "reward_std": 0.4201158583164215, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.048583984375, "rewards/ppl_reward/std": 4.367459774017334, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 147.796875, "completions/mean_terminated_length": 147.796875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.0657934815717334, "grad_norm": 2.737790107727051, "kl": 4.6123046875, "learning_rate": 1.4604265302820333e-05, "loss": 0.1625, "num_tokens": 32306591.0, "reward": -1.57891845703125, "reward_std": 1.142917275428772, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.8843994140625, "rewards/ppl_reward/std": 6.770274639129639, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 189.46875, "completions/mean_terminated_length": 176.22222900390625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.067011879378617, "grad_norm": 10.443564414978027, "kl": 17.796875, "learning_rate": 1.4596705952291017e-05, "loss": 0.9598, "num_tokens": 32325677.0, "reward": -1.24322509765625, "reward_std": 0.9781322479248047, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.9552001953125, "rewards/ppl_reward/std": 3.6760244369506836, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22658175230026245, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.068230277185501, "grad_norm": 5.9169158935546875, "kl": 13.73046875, "learning_rate": 1.4589143269848261e-05, "loss": 0.8106, "num_tokens": 32344293.0, "reward": -0.6168212890625, "reward_std": 0.7079316973686218, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.741455078125, "rewards/ppl_reward/std": 1.6121597290039062, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1925172060728073, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 145.921875, "completions/mean_terminated_length": 145.921875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.069448674992385, "grad_norm": 2.5470972061157227, "kl": 7.05859375, "learning_rate": 1.4581577260973866e-05, "loss": 0.3864, "num_tokens": 32359624.0, "reward": -0.795654296875, "reward_std": 0.7175304889678955, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.19287109375, "rewards/ppl_reward/std": 2.2657339572906494, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 207.296875, "completions/mean_terminated_length": 167.13113403320312, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.070667072799269, "grad_norm": 2.344278335571289, "kl": 13.70703125, "learning_rate": 1.4574007931152037e-05, "loss": 0.9773, "num_tokens": 32380211.0, "reward": -0.78546142578125, "reward_std": 0.9280201196670532, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.1646728515625, "rewards/ppl_reward/std": 3.214315414428711, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23517554998397827, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 176.390625, "completions/mean_terminated_length": 162.9365234375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.071885470606153, "grad_norm": 2.8098886013031006, "kl": 7.6103515625, "learning_rate": 1.4566435285869385e-05, "loss": 0.5063, "num_tokens": 32398124.0, "reward": -0.17919921875, "reward_std": 0.6086081266403198, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.1474609375, "rewards/ppl_reward/std": 2.305509567260742, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 184.3125, "completions/mean_terminated_length": 170.984130859375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.0731038684130367, "grad_norm": 2.189943313598633, "kl": 7.224609375, "learning_rate": 1.4558859330614932e-05, "loss": 0.5131, "num_tokens": 32417856.0, "reward": -1.52764892578125, "reward_std": 0.8589963912963867, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.8287353515625, "rewards/ppl_reward/std": 3.9009010791778564, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 156.578125, "completions/mean_terminated_length": 156.578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.0743222662199208, "grad_norm": 1.6343648433685303, "kl": 3.6435546875, "learning_rate": 1.4551280070880089e-05, "loss": 0.1629, "num_tokens": 32435053.0, "reward": -7.137451171875, "reward_std": 4.259431838989258, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -18.00927734375, "rewards/ppl_reward/std": 35.14144515991211, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 175.328125, "completions/mean_terminated_length": 175.328125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.075540664026805, "grad_norm": 2.2895424365997314, "kl": 6.7099609375, "learning_rate": 1.4543697512158673e-05, "loss": 0.4805, "num_tokens": 32453410.0, "reward": -2.6490478515625, "reward_std": 1.8938755989074707, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.040283203125, "rewards/ppl_reward/std": 11.279006958007812, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.076759061833689, "grad_norm": 3.032179355621338, "kl": 1.5703125, "learning_rate": 1.4536111659946883e-05, "loss": 0.0496, "num_tokens": 32468734.0, "reward": -0.3505859375, "reward_std": 0.2412191927433014, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.591796875, "rewards/ppl_reward/std": 1.9588191509246826, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 147.65625, "completions/mean_terminated_length": 147.65625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.0779774596405725, "grad_norm": 1.7894418239593506, "kl": 1.6416015625, "learning_rate": 1.4528522519743316e-05, "loss": -0.0039, "num_tokens": 32485136.0, "reward": -0.9403076171875, "reward_std": 0.3254528343677521, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.802490234375, "rewards/ppl_reward/std": 6.529156684875488, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 147.1875, "completions/mean_terminated_length": 147.1875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.0791958574474565, "grad_norm": 1.9324941635131836, "kl": 4.40625, "learning_rate": 1.4520930097048936e-05, "loss": 0.1711, "num_tokens": 32501180.0, "reward": -0.7626953125, "reward_std": 0.522926926612854, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.275390625, "rewards/ppl_reward/std": 1.8433475494384766, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 130.53125, "completions/mean_terminated_length": 130.53125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.0804142552543405, "grad_norm": 2.1457021236419678, "kl": 5.015625, "learning_rate": 1.4513334397367103e-05, "loss": 0.1449, "num_tokens": 32515798.0, "reward": -4.7935791015625, "reward_std": 1.4326059818267822, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -13.251220703125, "rewards/ppl_reward/std": 15.513420104980469, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.0816326530612246, "grad_norm": 2.8970859050750732, "kl": 5.4765625, "learning_rate": 1.4505735426203545e-05, "loss": 0.252, "num_tokens": 32532270.0, "reward": -2.755859375, "reward_std": 2.459861993789673, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.93359375, "rewards/ppl_reward/std": 10.300494194030762, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.209963858127594, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 153.28125, "completions/mean_terminated_length": 153.28125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.0828510508681086, "grad_norm": 2.638155460357666, "kl": 5.91015625, "learning_rate": 1.4498133189066358e-05, "loss": 0.2077, "num_tokens": 32549712.0, "reward": -3.0225830078125, "reward_std": 1.038276195526123, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.537353515625, "rewards/ppl_reward/std": 7.33127498626709, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 150.078125, "completions/mean_terminated_length": 150.078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.084069448674992, "grad_norm": 4.722658157348633, "kl": 9.484375, "learning_rate": 1.4490527691466007e-05, "loss": 0.4484, "num_tokens": 32566237.0, "reward": -2.1763916015625, "reward_std": 1.461301326751709, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.735595703125, "rewards/ppl_reward/std": 8.485307693481445, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18729320168495178, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 138.59375, "completions/mean_terminated_length": 138.59375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.0852878464818763, "grad_norm": 1.829606533050537, "kl": 4.57421875, "learning_rate": 1.4482918938915331e-05, "loss": 0.1751, "num_tokens": 32581635.0, "reward": -1.810546875, "reward_std": 0.8896722197532654, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.13671875, "rewards/ppl_reward/std": 2.904866933822632, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.0865062442887603, "grad_norm": 3.870772361755371, "kl": 8.2421875, "learning_rate": 1.4475306936929513e-05, "loss": 0.4964, "num_tokens": 32597179.0, "reward": -3.1492919921875, "reward_std": 1.447890281677246, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.868896484375, "rewards/ppl_reward/std": 8.144418716430664, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.21675680577754974, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 134.796875, "completions/mean_terminated_length": 134.796875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.0877246420956443, "grad_norm": 3.2111339569091797, "kl": 5.8125, "learning_rate": 1.4467691691026098e-05, "loss": 0.269, "num_tokens": 32612278.0, "reward": -0.8768310546875, "reward_std": 1.1514965295791626, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.238037109375, "rewards/ppl_reward/std": 2.957134246826172, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 138.53125, "completions/mean_terminated_length": 138.53125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.0889430399025284, "grad_norm": 3.3245577812194824, "kl": 6.57421875, "learning_rate": 1.4460073206724984e-05, "loss": 0.2606, "num_tokens": 32627696.0, "reward": -2.2078857421875, "reward_std": 0.9152010083198547, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.978271484375, "rewards/ppl_reward/std": 7.365997791290283, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16060402989387512, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 140.65625, "completions/mean_terminated_length": 140.65625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.090161437709412, "grad_norm": 1.8477470874786377, "kl": 3.9296875, "learning_rate": 1.4452451489548416e-05, "loss": 0.1205, "num_tokens": 32644034.0, "reward": -0.445068359375, "reward_std": 0.6952501535415649, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.43701171875, "rewards/ppl_reward/std": 2.405320405960083, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 141.828125, "completions/mean_terminated_length": 141.828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.091379835516296, "grad_norm": 2.4130282402038574, "kl": 2.091796875, "learning_rate": 1.4444826545020979e-05, "loss": 0.023, "num_tokens": 32660135.0, "reward": -1.8258056640625, "reward_std": 0.4765937924385071, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.362548828125, "rewards/ppl_reward/std": 4.220510005950928, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 142.59375, "completions/mean_terminated_length": 142.59375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.09259823332318, "grad_norm": 1.4669450521469116, "kl": 3.6171875, "learning_rate": 1.4437198378669598e-05, "loss": 0.0911, "num_tokens": 32675997.0, "reward": -1.5804443359375, "reward_std": 0.7341619729995728, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.840576171875, "rewards/ppl_reward/std": 3.683891534805298, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 163.859375, "completions/mean_terminated_length": 163.859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.093816631130064, "grad_norm": 5.474968433380127, "kl": 3.12109375, "learning_rate": 1.4429566996023539e-05, "loss": 0.2027, "num_tokens": 32694244.0, "reward": -1.4150390625, "reward_std": 0.39655110239982605, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.673828125, "rewards/ppl_reward/std": 2.7687621116638184, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 143.015625, "completions/mean_terminated_length": 143.015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.0950350289369477, "grad_norm": 2.9943528175354004, "kl": 3.740234375, "learning_rate": 1.442193240261439e-05, "loss": 0.1466, "num_tokens": 32710205.0, "reward": -3.013427734375, "reward_std": 0.5272377133369446, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.79248046875, "rewards/ppl_reward/std": 12.290464401245117, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.0962534267438317, "grad_norm": 1.745720624923706, "kl": 5.72265625, "learning_rate": 1.4414294603976076e-05, "loss": 0.2594, "num_tokens": 32725909.0, "reward": -0.5816650390625, "reward_std": 0.8517223596572876, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.835205078125, "rewards/ppl_reward/std": 2.5719480514526367, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.23345555365085602, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 137.703125, "completions/mean_terminated_length": 137.703125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.097471824550716, "grad_norm": 1.9277082681655884, "kl": 4.009765625, "learning_rate": 1.4406653605644836e-05, "loss": 0.1258, "num_tokens": 32741970.0, "reward": -0.733642578125, "reward_std": 0.5251606702804565, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.20166015625, "rewards/ppl_reward/std": 1.8690789937973022, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.0986902223576, "grad_norm": 1.6758537292480469, "kl": 4.1474609375, "learning_rate": 1.4399009413159234e-05, "loss": 0.1463, "num_tokens": 32757986.0, "reward": -0.9248046875, "reward_std": 0.4237726330757141, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.623046875, "rewards/ppl_reward/std": 2.342247486114502, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.099908620164484, "grad_norm": 3.545954942703247, "kl": 8.9873046875, "learning_rate": 1.4391362032060152e-05, "loss": 0.3824, "num_tokens": 32775950.0, "reward": -0.5771484375, "reward_std": 0.6776581406593323, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.662109375, "rewards/ppl_reward/std": 2.798348903656006, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 143.609375, "completions/mean_terminated_length": 143.609375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.1011270179713675, "grad_norm": 2.721775531768799, "kl": 7.71484375, "learning_rate": 1.4383711467890776e-05, "loss": 0.3277, "num_tokens": 32792261.0, "reward": -2.3013916015625, "reward_std": 1.2924724817276, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.196533203125, "rewards/ppl_reward/std": 5.767910957336426, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 149.609375, "completions/mean_terminated_length": 149.609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.1023454157782515, "grad_norm": 1.271293044090271, "kl": 1.5810546875, "learning_rate": 1.4376057726196601e-05, "loss": 0.0612, "num_tokens": 32809572.0, "reward": -0.667724609375, "reward_std": 0.22710639238357544, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.24951171875, "rewards/ppl_reward/std": 1.633097529411316, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 146.421875, "completions/mean_terminated_length": 146.421875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.1035638135851356, "grad_norm": 2.9718105792999268, "kl": 7.99609375, "learning_rate": 1.4368400812525434e-05, "loss": 0.3411, "num_tokens": 32825543.0, "reward": -7.4727783203125, "reward_std": 2.3512871265411377, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -18.531494140625, "rewards/ppl_reward/std": 25.773988723754883, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2052978277206421, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 147.640625, "completions/mean_terminated_length": 147.640625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.1047822113920196, "grad_norm": 2.2345144748687744, "kl": 8.01953125, "learning_rate": 1.4360740732427367e-05, "loss": 0.3657, "num_tokens": 32841888.0, "reward": -2.3743896484375, "reward_std": 1.366023063659668, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.287841796875, "rewards/ppl_reward/std": 7.926411151885986, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.1060006091989036, "grad_norm": 4.1728386878967285, "kl": 10.6796875, "learning_rate": 1.4353077491454794e-05, "loss": 0.447, "num_tokens": 32858848.0, "reward": -1.81689453125, "reward_std": 1.6291840076446533, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.0009765625, "rewards/ppl_reward/std": 6.43769645690918, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.26325830817222595, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 150.359375, "completions/mean_terminated_length": 150.359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.1072190070057872, "grad_norm": 1.7742928266525269, "kl": 3.671875, "learning_rate": 1.4345411095162407e-05, "loss": 0.1222, "num_tokens": 32876015.0, "reward": -1.429443359375, "reward_std": 0.7178823947906494, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.62451171875, "rewards/ppl_reward/std": 4.448866367340088, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.1084374048126713, "grad_norm": 3.1350855827331543, "kl": 10.703125, "learning_rate": 1.4337741549107174e-05, "loss": 0.5773, "num_tokens": 32892447.0, "reward": -2.02099609375, "reward_std": 1.5425281524658203, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.4873046875, "rewards/ppl_reward/std": 4.881026744842529, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.270231693983078, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 148.46875, "completions/mean_terminated_length": 148.46875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.1096558026195553, "grad_norm": 1.6585302352905273, "kl": 4.39453125, "learning_rate": 1.4330068858848351e-05, "loss": 0.1846, "num_tokens": 32908965.0, "reward": -2.84881591796875, "reward_std": 2.3590450286865234, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.3773193359375, "rewards/ppl_reward/std": 13.512539863586426, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 149.453125, "completions/mean_terminated_length": 149.453125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.1108742004264394, "grad_norm": 2.327657699584961, "kl": 6.15625, "learning_rate": 1.432239302994747e-05, "loss": 0.2869, "num_tokens": 32925482.0, "reward": -1.9365234375, "reward_std": 0.9599679112434387, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.521484375, "rewards/ppl_reward/std": 4.170588493347168, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.112092598233323, "grad_norm": 2.635073184967041, "kl": 2.7568359375, "learning_rate": 1.4314714067968347e-05, "loss": 0.066, "num_tokens": 32943186.0, "reward": -1.2589111328125, "reward_std": 0.36623281240463257, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.353759765625, "rewards/ppl_reward/std": 3.327219009399414, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 143.171875, "completions/mean_terminated_length": 143.171875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.113310996040207, "grad_norm": 1.7179604768753052, "kl": 3.5283203125, "learning_rate": 1.4307031978477062e-05, "loss": 0.1733, "num_tokens": 32959421.0, "reward": -6.1998291015625, "reward_std": 1.1456186771392822, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -16.235595703125, "rewards/ppl_reward/std": 31.81765365600586, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 128.96875, "completions/mean_terminated_length": 128.96875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.114529393847091, "grad_norm": 1.8538252115249634, "kl": 2.1572265625, "learning_rate": 1.4299346767041956e-05, "loss": 0.0851, "num_tokens": 32974971.0, "reward": -2.85498046875, "reward_std": 1.9832209348678589, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.5302734375, "rewards/ppl_reward/std": 13.512777328491211, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 166.484375, "completions/mean_terminated_length": 166.484375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.115747791653975, "grad_norm": 2.2109110355377197, "kl": 4.62890625, "learning_rate": 1.4291658439233645e-05, "loss": 0.1944, "num_tokens": 32993866.0, "reward": -0.77099609375, "reward_std": 0.5321949124336243, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.2529296875, "rewards/ppl_reward/std": 3.6047332286834717, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 133.4375, "completions/mean_terminated_length": 133.4375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.116966189460859, "grad_norm": 2.36430025100708, "kl": 2.837890625, "learning_rate": 1.4283967000624996e-05, "loss": 0.0891, "num_tokens": 33009110.0, "reward": -1.81439208984375, "reward_std": 0.9271364808082581, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.2850341796875, "rewards/ppl_reward/std": 9.183005332946777, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 138.84375, "completions/mean_terminated_length": 138.84375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.1181845872677427, "grad_norm": 1.7636317014694214, "kl": 2.671875, "learning_rate": 1.4276272456791136e-05, "loss": 0.0455, "num_tokens": 33024588.0, "reward": -0.928955078125, "reward_std": 1.3187730312347412, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.63134765625, "rewards/ppl_reward/std": 3.893413543701172, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 157.328125, "completions/mean_terminated_length": 157.328125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.1194029850746268, "grad_norm": 2.3171050548553467, "kl": 8.21875, "learning_rate": 1.4268574813309442e-05, "loss": 0.4445, "num_tokens": 33042161.0, "reward": -0.886962890625, "reward_std": 0.7233231067657471, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.18798828125, "rewards/ppl_reward/std": 4.447600364685059, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.21532177925109863, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 150.96875, "completions/mean_terminated_length": 150.96875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.120621382881511, "grad_norm": 3.747553825378418, "kl": 5.5625, "learning_rate": 1.426087407575953e-05, "loss": 0.1659, "num_tokens": 33059719.0, "reward": -1.6114501953125, "reward_std": 0.9317579865455627, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.582275390625, "rewards/ppl_reward/std": 3.4122748374938965, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2708333432674408, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 152.953125, "completions/mean_terminated_length": 152.953125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.121839780688395, "grad_norm": 5.304819583892822, "kl": 4.546875, "learning_rate": 1.4253170249723269e-05, "loss": 0.1936, "num_tokens": 33077100.0, "reward": -2.1219482421875, "reward_std": 1.3253384828567505, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.861083984375, "rewards/ppl_reward/std": 7.069565773010254, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 149.203125, "completions/mean_terminated_length": 149.203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.123058178495279, "grad_norm": 2.4043240547180176, "kl": 2.978515625, "learning_rate": 1.4245463340784761e-05, "loss": 0.1123, "num_tokens": 33094489.0, "reward": -0.86572265625, "reward_std": 0.425045907497406, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.4814453125, "rewards/ppl_reward/std": 3.1035423278808594, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 145.46875, "completions/mean_terminated_length": 145.46875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.1242765763021625, "grad_norm": 4.3131842613220215, "kl": 6.546875, "learning_rate": 1.4237753354530349e-05, "loss": 0.3067, "num_tokens": 33110823.0, "reward": -0.8099365234375, "reward_std": 0.7913920283317566, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.237060546875, "rewards/ppl_reward/std": 2.968975067138672, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.1254949741090465, "grad_norm": 3.1604113578796387, "kl": 4.140625, "learning_rate": 1.4230040296548588e-05, "loss": 0.1165, "num_tokens": 33125487.0, "reward": -1.892333984375, "reward_std": 0.9717133045196533, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.37060546875, "rewards/ppl_reward/std": 5.226814270019531, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 148.890625, "completions/mean_terminated_length": 148.890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 2.1267133719159306, "grad_norm": 1.7938008308410645, "kl": 3.154296875, "learning_rate": 1.4222324172430289e-05, "loss": 0.1209, "num_tokens": 33142496.0, "reward": -1.6754150390625, "reward_std": 1.4857101440429688, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.100830078125, "rewards/ppl_reward/std": 7.927764892578125, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 153.890625, "completions/mean_terminated_length": 153.890625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.1279317697228146, "grad_norm": 3.200701951980591, "kl": 6.0390625, "learning_rate": 1.4214604987768461e-05, "loss": 0.3915, "num_tokens": 33159985.0, "reward": -1.13134765625, "reward_std": 0.6632151007652283, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.8720703125, "rewards/ppl_reward/std": 2.8914122581481934, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 135.796875, "completions/mean_terminated_length": 135.796875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.1291501675296987, "grad_norm": 1.593445062637329, "kl": 1.9658203125, "learning_rate": 1.4206882748158341e-05, "loss": 0.05, "num_tokens": 33175500.0, "reward": -1.0870361328125, "reward_std": 0.6622371077537537, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.025634765625, "rewards/ppl_reward/std": 3.852637529373169, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 147.234375, "completions/mean_terminated_length": 147.234375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.1303685653365823, "grad_norm": 2.2860851287841797, "kl": 6.09765625, "learning_rate": 1.4199157459197383e-05, "loss": 0.272, "num_tokens": 33192243.0, "reward": -1.883544921875, "reward_std": 0.8863512277603149, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.25927734375, "rewards/ppl_reward/std": 3.9526526927948, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 150.640625, "completions/mean_terminated_length": 150.640625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.1315869631434663, "grad_norm": 3.356677770614624, "kl": 6.33203125, "learning_rate": 1.4191429126485248e-05, "loss": 0.4044, "num_tokens": 33208756.0, "reward": -1.3402099609375, "reward_std": 1.0537333488464355, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.266357421875, "rewards/ppl_reward/std": 5.928236484527588, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 140.59375, "completions/mean_terminated_length": 140.59375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.1328053609503503, "grad_norm": 1.7579758167266846, "kl": 5.37890625, "learning_rate": 1.4183697755623796e-05, "loss": 0.2492, "num_tokens": 33224666.0, "reward": -3.0213623046875, "reward_std": 1.3485015630722046, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -9.503662109375, "rewards/ppl_reward/std": 6.863608360290527, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1620931327342987, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 139.421875, "completions/mean_terminated_length": 139.421875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.1340237587572344, "grad_norm": 3.554534673690796, "kl": 2.1904296875, "learning_rate": 1.4175963352217099e-05, "loss": 0.0532, "num_tokens": 33240893.0, "reward": -1.6705322265625, "reward_std": 0.7891226410865784, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.177001953125, "rewards/ppl_reward/std": 6.123800754547119, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 126.59375, "completions/mean_terminated_length": 126.59375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.135242156564118, "grad_norm": 1.5524762868881226, "kl": 2.640625, "learning_rate": 1.4168225921871433e-05, "loss": 0.0549, "num_tokens": 33255371.0, "reward": -1.4193115234375, "reward_std": 0.6639153957366943, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.541748046875, "rewards/ppl_reward/std": 3.129218578338623, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.136460554371002, "grad_norm": 1.5644723176956177, "kl": 4.32421875, "learning_rate": 1.4160485470195245e-05, "loss": 0.2131, "num_tokens": 33270771.0, "reward": -1.266357421875, "reward_std": 0.552130937576294, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.30615234375, "rewards/ppl_reward/std": 3.0461807250976562, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 129.15625, "completions/mean_terminated_length": 129.15625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.137678952177886, "grad_norm": 2.2952864170074463, "kl": 4.671875, "learning_rate": 1.4152742002799196e-05, "loss": 0.1629, "num_tokens": 33285981.0, "reward": -4.994873046875, "reward_std": 1.8494712114334106, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -13.59130859375, "rewards/ppl_reward/std": 15.946953773498535, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 121.671875, "completions/mean_terminated_length": 121.671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.13889734998477, "grad_norm": 2.7866103649139404, "kl": 6.81640625, "learning_rate": 1.4144995525296124e-05, "loss": 0.418, "num_tokens": 33300104.0, "reward": -3.3394775390625, "reward_std": 1.4770891666412354, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.428955078125, "rewards/ppl_reward/std": 11.025197982788086, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.140115747791654, "grad_norm": 3.257434606552124, "kl": 10.3125, "learning_rate": 1.4137246043301042e-05, "loss": 0.5693, "num_tokens": 33315848.0, "reward": -1.581787109375, "reward_std": 1.123052954673767, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.52294921875, "rewards/ppl_reward/std": 2.9707283973693848, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.24384792149066925, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 156.109375, "completions/mean_terminated_length": 156.109375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.1413341455985377, "grad_norm": 1.606469750404358, "kl": 7.12890625, "learning_rate": 1.4129493562431156e-05, "loss": 0.3939, "num_tokens": 33332831.0, "reward": -0.07537841796875, "reward_std": 0.5651966333389282, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.8538818359375, "rewards/ppl_reward/std": 2.073005199432373, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.142552543405422, "grad_norm": 3.674333095550537, "kl": 6.806640625, "learning_rate": 1.4121738088305827e-05, "loss": 0.3879, "num_tokens": 33347279.0, "reward": -2.25390625, "reward_std": 1.141451358795166, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.25, "rewards/ppl_reward/std": 4.239558696746826, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 142.828125, "completions/mean_terminated_length": 142.828125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.143770941212306, "grad_norm": 3.3052310943603516, "kl": 7.10546875, "learning_rate": 1.4113979626546602e-05, "loss": 0.3361, "num_tokens": 33363820.0, "reward": -0.88909912109375, "reward_std": 0.5540233850479126, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.5047607421875, "rewards/ppl_reward/std": 2.668649673461914, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 162.453125, "completions/mean_terminated_length": 162.453125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.14498933901919, "grad_norm": 8.578070640563965, "kl": 17.3671875, "learning_rate": 1.4106218182777182e-05, "loss": 0.9166, "num_tokens": 33381129.0, "reward": -1.7840576171875, "reward_std": 1.224625825881958, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.982177734375, "rewards/ppl_reward/std": 4.179508686065674, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 146.296875, "completions/mean_terminated_length": 146.296875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.146207736826074, "grad_norm": 2.0188801288604736, "kl": 6.68359375, "learning_rate": 1.4098453762623443e-05, "loss": 0.4109, "num_tokens": 33397396.0, "reward": -1.16259765625, "reward_std": 0.48571479320526123, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.1376953125, "rewards/ppl_reward/std": 2.401989459991455, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.1474261346329575, "grad_norm": 2.953538417816162, "kl": 7.17578125, "learning_rate": 1.4090686371713403e-05, "loss": 0.3714, "num_tokens": 33414968.0, "reward": -0.8116455078125, "reward_std": 0.4286159574985504, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.263916015625, "rewards/ppl_reward/std": 1.678785800933838, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 147.28125, "completions/mean_terminated_length": 147.28125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.1486445324398415, "grad_norm": 2.657768487930298, "kl": 5.7568359375, "learning_rate": 1.4082916015677237e-05, "loss": 0.2677, "num_tokens": 33431378.0, "reward": -2.389404296875, "reward_std": 0.48215150833129883, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.65380859375, "rewards/ppl_reward/std": 7.678672790527344, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.1498629302467256, "grad_norm": 2.572502613067627, "kl": 10.0546875, "learning_rate": 1.4075142700147282e-05, "loss": 0.5043, "num_tokens": 33448234.0, "reward": -0.94720458984375, "reward_std": 1.1756904125213623, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.3319091796875, "rewards/ppl_reward/std": 3.8448104858398438, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 157.078125, "completions/mean_terminated_length": 157.078125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.1510813280536096, "grad_norm": 2.1560821533203125, "kl": 5.8447265625, "learning_rate": 1.4067366430758004e-05, "loss": 0.3532, "num_tokens": 33465831.0, "reward": -1.792724609375, "reward_std": 0.8493289351463318, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.19482421875, "rewards/ppl_reward/std": 3.2524709701538086, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 163.4375, "completions/mean_terminated_length": 149.7777862548828, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.1522997258604937, "grad_norm": 3.4721314907073975, "kl": 7.984375, "learning_rate": 1.405958721314602e-05, "loss": 0.5249, "num_tokens": 33482931.0, "reward": -1.795166015625, "reward_std": 0.7909374237060547, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.23876953125, "rewards/ppl_reward/std": 1.938822627067566, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 144.703125, "completions/mean_terminated_length": 144.703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 2.1535181236673773, "grad_norm": 3.4517786502838135, "kl": 4.55078125, "learning_rate": 1.4051805052950078e-05, "loss": 0.1674, "num_tokens": 33499248.0, "reward": -1.91455078125, "reward_std": 0.5353562831878662, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.5087890625, "rewards/ppl_reward/std": 3.5792481899261475, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 134.84375, "completions/mean_terminated_length": 134.84375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.1547365214742613, "grad_norm": 1.585288405418396, "kl": 4.79296875, "learning_rate": 1.4044019955811066e-05, "loss": 0.1714, "num_tokens": 33514622.0, "reward": -1.5751953125, "reward_std": 0.9740942120552063, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.791015625, "rewards/ppl_reward/std": 5.382632255554199, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 138.734375, "completions/mean_terminated_length": 138.734375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.1559549192811454, "grad_norm": 1.6985903978347778, "kl": 3.2236328125, "learning_rate": 1.4036231927371992e-05, "loss": 0.1099, "num_tokens": 33530845.0, "reward": -1.6982421875, "reward_std": 0.4703424870967865, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.154296875, "rewards/ppl_reward/std": 5.484394073486328, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 141.6875, "completions/mean_terminated_length": 141.6875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.1571733170880294, "grad_norm": 3.246314764022827, "kl": 7.046875, "learning_rate": 1.4028440973277998e-05, "loss": 0.4072, "num_tokens": 33546113.0, "reward": -0.6427001953125, "reward_std": 0.9093523025512695, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.715087890625, "rewards/ppl_reward/std": 1.7691185474395752, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22100594639778137, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 141.921875, "completions/mean_terminated_length": 141.921875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.158391714894913, "grad_norm": 2.0067591667175293, "kl": 5.23828125, "learning_rate": 1.402064709917634e-05, "loss": 0.157, "num_tokens": 33562236.0, "reward": -0.4671630859375, "reward_std": 0.9724655747413635, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.465576171875, "rewards/ppl_reward/std": 2.0490996837615967, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2539372742176056, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.159610112701797, "grad_norm": 2.411182165145874, "kl": 4.63671875, "learning_rate": 1.401285031071639e-05, "loss": 0.1778, "num_tokens": 33578872.0, "reward": -0.8111572265625, "reward_std": 0.46775418519973755, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.333251953125, "rewards/ppl_reward/std": 2.0707359313964844, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 146.703125, "completions/mean_terminated_length": 146.703125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.160828510508681, "grad_norm": 1.8692800998687744, "kl": 4.390625, "learning_rate": 1.4005050613549637e-05, "loss": 0.2315, "num_tokens": 33595029.0, "reward": -0.3505859375, "reward_std": 0.4681814908981323, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.521484375, "rewards/ppl_reward/std": 1.9723635911941528, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 118.984375, "completions/mean_terminated_length": 118.984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.162046908315565, "grad_norm": 2.219650983810425, "kl": 4.4052734375, "learning_rate": 1.3997248013329681e-05, "loss": 0.062, "num_tokens": 33608980.0, "reward": -2.5908203125, "reward_std": 1.8860806226730347, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.791015625, "rewards/ppl_reward/std": 10.251871109008789, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 153.3125, "completions/mean_terminated_length": 153.3125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.163265306122449, "grad_norm": 1.6185482740402222, "kl": 7.376953125, "learning_rate": 1.3989442515712215e-05, "loss": 0.3802, "num_tokens": 33625984.0, "reward": -1.236083984375, "reward_std": 1.0074574947357178, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.08935546875, "rewards/ppl_reward/std": 3.6256697177886963, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 154.03125, "completions/mean_terminated_length": 154.03125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.1644837039293328, "grad_norm": 1.703794002532959, "kl": 6.5234375, "learning_rate": 1.3981634126355039e-05, "loss": 0.2903, "num_tokens": 33643250.0, "reward": -0.49334716796875, "reward_std": 0.5879985690116882, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.6195068359375, "rewards/ppl_reward/std": 2.168370008468628, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 153.859375, "completions/mean_terminated_length": 153.859375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.165702101736217, "grad_norm": 5.885637283325195, "kl": 8.12109375, "learning_rate": 1.3973822850918055e-05, "loss": 0.4398, "num_tokens": 33659681.0, "reward": -0.6435546875, "reward_std": 0.6414900422096252, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.951171875, "rewards/ppl_reward/std": 1.5547688007354736, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 148.21875, "completions/mean_terminated_length": 148.21875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.166920499543101, "grad_norm": 2.0246005058288574, "kl": 5.93359375, "learning_rate": 1.3966008695063244e-05, "loss": 0.3269, "num_tokens": 33675631.0, "reward": -1.569580078125, "reward_std": 0.7091768980026245, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.95166015625, "rewards/ppl_reward/std": 4.79396390914917, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 141.234375, "completions/mean_terminated_length": 141.234375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.168138897349985, "grad_norm": 2.1689369678497314, "kl": 3.3046875, "learning_rate": 1.3958191664454687e-05, "loss": 0.0791, "num_tokens": 33691614.0, "reward": -0.861572265625, "reward_std": 0.375278502702713, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.51220703125, "rewards/ppl_reward/std": 2.9207701683044434, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 148.265625, "completions/mean_terminated_length": 148.265625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.169357295156869, "grad_norm": 1.3756054639816284, "kl": 4.4873046875, "learning_rate": 1.3950371764758543e-05, "loss": 0.175, "num_tokens": 33708615.0, "reward": -1.0283203125, "reward_std": 0.416816771030426, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.806640625, "rewards/ppl_reward/std": 1.9562753438949585, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.1705756929637525, "grad_norm": 1.4411735534667969, "kl": 2.50390625, "learning_rate": 1.3942549001643047e-05, "loss": 0.0317, "num_tokens": 33725719.0, "reward": -0.41363525390625, "reward_std": 1.6609878540039062, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.6397705078125, "rewards/ppl_reward/std": 7.071939468383789, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 155.828125, "completions/mean_terminated_length": 155.828125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.1717940907706366, "grad_norm": 1.5960652828216553, "kl": 4.96484375, "learning_rate": 1.3934723380778517e-05, "loss": 0.2532, "num_tokens": 33743412.0, "reward": -0.1439208984375, "reward_std": 0.49076002836227417, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.975341796875, "rewards/ppl_reward/std": 1.0729557275772095, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.1730124885775206, "grad_norm": 1.8782305717468262, "kl": 8.43359375, "learning_rate": 1.392689490783734e-05, "loss": 0.4685, "num_tokens": 33759986.0, "reward": -1.17333984375, "reward_std": 1.4068324565887451, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.7998046875, "rewards/ppl_reward/std": 4.202301502227783, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2236899733543396, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 137.546875, "completions/mean_terminated_length": 137.546875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.1742308863844046, "grad_norm": 1.5781465768814087, "kl": 3.462890625, "learning_rate": 1.3919063588493971e-05, "loss": 0.0463, "num_tokens": 33775269.0, "reward": -0.9046630859375, "reward_std": 0.6227259635925293, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.660888671875, "rewards/ppl_reward/std": 3.680730104446411, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.1754492841912887, "grad_norm": 2.3509883880615234, "kl": 4.44921875, "learning_rate": 1.3911229428424919e-05, "loss": 0.1289, "num_tokens": 33790845.0, "reward": -1.449951171875, "reward_std": 0.6233565807342529, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.50927734375, "rewards/ppl_reward/std": 3.6327524185180664, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 164.765625, "completions/mean_terminated_length": 164.765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.1766676819981723, "grad_norm": 2.926663875579834, "kl": 7.43359375, "learning_rate": 1.3903392433308765e-05, "loss": 0.3143, "num_tokens": 33808782.0, "reward": -3.274169921875, "reward_std": 1.8345582485198975, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -9.86865234375, "rewards/ppl_reward/std": 8.36306095123291, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1960279643535614, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 140.5625, "completions/mean_terminated_length": 140.5625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.1778860798050563, "grad_norm": 2.129638910293579, "kl": 3.078125, "learning_rate": 1.3895552608826144e-05, "loss": 0.1156, "num_tokens": 33824770.0, "reward": -2.096435546875, "reward_std": 0.83333420753479, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.85693359375, "rewards/ppl_reward/std": 8.653908729553223, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 170.203125, "completions/mean_terminated_length": 170.203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.1791044776119404, "grad_norm": 5.071650505065918, "kl": 6.6484375, "learning_rate": 1.3887709960659729e-05, "loss": 0.3458, "num_tokens": 33842903.0, "reward": -0.2874755859375, "reward_std": 0.3572145104408264, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.184326171875, "rewards/ppl_reward/std": 1.2955526113510132, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 160.515625, "completions/mean_terminated_length": 160.515625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.1803228754188244, "grad_norm": 2.111018657684326, "kl": 6.81640625, "learning_rate": 1.3879864494494252e-05, "loss": 0.2932, "num_tokens": 33861048.0, "reward": -0.603759765625, "reward_std": 0.6022096872329712, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.76220703125, "rewards/ppl_reward/std": 1.5544222593307495, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 148.59375, "completions/mean_terminated_length": 148.59375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.181541273225708, "grad_norm": 2.2004997730255127, "kl": 4.98828125, "learning_rate": 1.3872016216016489e-05, "loss": 0.1547, "num_tokens": 33878006.0, "reward": -1.2685546875, "reward_std": 0.9390226602554321, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.201171875, "rewards/ppl_reward/std": 3.4408605098724365, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 175.703125, "completions/mean_terminated_length": 175.703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.182759671032592, "grad_norm": 3.383892297744751, "kl": 5.05859375, "learning_rate": 1.3864165130915242e-05, "loss": 0.3127, "num_tokens": 33897611.0, "reward": -0.1834716796875, "reward_std": 0.4446815550327301, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.062255859375, "rewards/ppl_reward/std": 1.4422651529312134, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 154.0625, "completions/mean_terminated_length": 154.0625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.183978068839476, "grad_norm": 2.9918768405914307, "kl": 4.796875, "learning_rate": 1.385631124488136e-05, "loss": 0.1729, "num_tokens": 33915231.0, "reward": -1.434814453125, "reward_std": 0.5808061361312866, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.49462890625, "rewards/ppl_reward/std": 3.5278127193450928, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 156.796875, "completions/mean_terminated_length": 156.796875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.18519646664636, "grad_norm": 1.8757059574127197, "kl": 2.822265625, "learning_rate": 1.3848454563607714e-05, "loss": 0.0578, "num_tokens": 33932426.0, "reward": -3.136474609375, "reward_std": 0.7434186935424805, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.99169921875, "rewards/ppl_reward/std": 6.9957170486450195, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.186414864453244, "grad_norm": 3.8495569229125977, "kl": 4.3046875, "learning_rate": 1.3840595092789204e-05, "loss": 0.2871, "num_tokens": 33949946.0, "reward": -2.4903564453125, "reward_std": 0.930277407169342, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.707275390625, "rewards/ppl_reward/std": 6.934742450714111, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 140.890625, "completions/mean_terminated_length": 140.890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.1876332622601278, "grad_norm": 1.9937397241592407, "kl": 5.22265625, "learning_rate": 1.3832732838122753e-05, "loss": 0.1804, "num_tokens": 33965563.0, "reward": -2.42822265625, "reward_std": 2.2339768409729004, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.3564453125, "rewards/ppl_reward/std": 13.04360294342041, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1666666716337204, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 149.484375, "completions/mean_terminated_length": 149.484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.188851660067012, "grad_norm": 1.3618769645690918, "kl": 3.1943359375, "learning_rate": 1.3824867805307305e-05, "loss": 0.1352, "num_tokens": 33982090.0, "reward": -2.803466796875, "reward_std": 0.5801594257354736, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.29443359375, "rewards/ppl_reward/std": 3.5791730880737305, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 143.78125, "completions/mean_terminated_length": 143.78125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.190070057873896, "grad_norm": 1.5237677097320557, "kl": 3.76171875, "learning_rate": 1.381700000004381e-05, "loss": 0.1061, "num_tokens": 33998348.0, "reward": -1.2093505859375, "reward_std": 1.1896551847457886, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.989013671875, "rewards/ppl_reward/std": 3.6270785331726074, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 146.109375, "completions/mean_terminated_length": 146.109375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.19128845568078, "grad_norm": 2.3874335289001465, "kl": 4.0859375, "learning_rate": 1.3809129428035229e-05, "loss": 0.1604, "num_tokens": 34014907.0, "reward": -1.4471435546875, "reward_std": 0.543570876121521, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.652099609375, "rewards/ppl_reward/std": 4.677767276763916, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 154.671875, "completions/mean_terminated_length": 154.671875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.1925068534876635, "grad_norm": 2.2962141036987305, "kl": 6.21484375, "learning_rate": 1.3801256094986536e-05, "loss": 0.2219, "num_tokens": 34032398.0, "reward": -1.599853515625, "reward_std": 0.5320079326629639, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.80908203125, "rewards/ppl_reward/std": 3.645958185195923, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 149.484375, "completions/mean_terminated_length": 149.484375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.1937252512945475, "grad_norm": 2.481077194213867, "kl": 5.21484375, "learning_rate": 1.37933800066047e-05, "loss": 0.1589, "num_tokens": 34049173.0, "reward": -1.90966796875, "reward_std": 1.723233938217163, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.3818359375, "rewards/ppl_reward/std": 7.725926399230957, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23517554998397827, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 136.484375, "completions/mean_terminated_length": 136.484375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.1949436491014316, "grad_norm": 2.0293753147125244, "kl": 4.6796875, "learning_rate": 1.3785501168598688e-05, "loss": 0.15, "num_tokens": 34064724.0, "reward": -1.24609375, "reward_std": 0.640535831451416, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.1484375, "rewards/ppl_reward/std": 2.9836158752441406, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 139.765625, "completions/mean_terminated_length": 139.765625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.1961620469083156, "grad_norm": 4.525181770324707, "kl": 8.515625, "learning_rate": 1.3777619586679458e-05, "loss": 0.3003, "num_tokens": 34080789.0, "reward": -2.64501953125, "reward_std": 1.0113836526870728, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -8.6181640625, "rewards/ppl_reward/std": 7.940977096557617, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21921011805534363, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.1973804447151997, "grad_norm": 5.712896823883057, "kl": 10.0078125, "learning_rate": 1.3769735266559963e-05, "loss": 0.423, "num_tokens": 34096789.0, "reward": -0.262451171875, "reward_std": 0.7174280285835266, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -3.93115234375, "rewards/ppl_reward/std": 1.5718389749526978, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16060402989387512, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 139.375, "completions/mean_terminated_length": 139.375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.1985988425220833, "grad_norm": 3.119143009185791, "kl": 9.078125, "learning_rate": 1.3761848213955136e-05, "loss": 0.4545, "num_tokens": 34112981.0, "reward": -1.2913818359375, "reward_std": 1.0624573230743408, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.981201171875, "rewards/ppl_reward/std": 3.647571086883545, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22980836033821106, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 118.6875, "completions/mean_terminated_length": 118.6875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.1998172403289673, "grad_norm": 1.718849539756775, "kl": 3.138671875, "learning_rate": 1.3753958434581893e-05, "loss": 0.0304, "num_tokens": 34127345.0, "reward": -0.9798583984375, "reward_std": 0.8629521131515503, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.686279296875, "rewards/ppl_reward/std": 4.160835266113281, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 141.21875, "completions/mean_terminated_length": 141.21875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.2010356381358513, "grad_norm": 3.392307996749878, "kl": 8.6796875, "learning_rate": 1.3746065934159123e-05, "loss": 0.4476, "num_tokens": 34143463.0, "reward": -2.01611328125, "reward_std": 0.7246639728546143, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -7.3681640625, "rewards/ppl_reward/std": 3.847646474838257, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18926911056041718, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 135.546875, "completions/mean_terminated_length": 135.546875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.2022540359427354, "grad_norm": 1.9637360572814941, "kl": 2.541015625, "learning_rate": 1.3738170718407689e-05, "loss": 0.0573, "num_tokens": 34159978.0, "reward": -0.6669921875, "reward_std": 0.9052129983901978, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.044921875, "rewards/ppl_reward/std": 2.93717098236084, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 126.921875, "completions/mean_terminated_length": 126.921875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.2034724337496194, "grad_norm": 2.8351221084594727, "kl": 6.28125, "learning_rate": 1.3730272793050426e-05, "loss": 0.2438, "num_tokens": 34174885.0, "reward": -1.304931640625, "reward_std": 0.9635518789291382, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.05517578125, "rewards/ppl_reward/std": 2.4998154640197754, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.204690831556503, "grad_norm": 2.8779852390289307, "kl": 3.83984375, "learning_rate": 1.3722372163812133e-05, "loss": 0.0689, "num_tokens": 34190165.0, "reward": -2.058349609375, "reward_std": 0.6104761362075806, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.76513671875, "rewards/ppl_reward/std": 4.071766376495361, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 126.671875, "completions/mean_terminated_length": 126.671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.205909229363387, "grad_norm": 2.4743731021881104, "kl": 4.921875, "learning_rate": 1.3714468836419555e-05, "loss": 0.2451, "num_tokens": 34205024.0, "reward": -0.341552734375, "reward_std": 0.5186541080474854, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -4.12060546875, "rewards/ppl_reward/std": 1.38135826587677, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 114.203125, "completions/mean_terminated_length": 114.203125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.207127627170271, "grad_norm": 3.7976367473602295, "kl": 3.1328125, "learning_rate": 1.370656281660141e-05, "loss": 0.0624, "num_tokens": 34218933.0, "reward": -2.35546875, "reward_std": 1.5490597486495972, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.375, "rewards/ppl_reward/std": 5.99948787689209, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2076999396085739, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.208346024977155, "grad_norm": 2.422309160232544, "kl": 5.5546875, "learning_rate": 1.3698654110088365e-05, "loss": 0.2806, "num_tokens": 34233753.0, "reward": -2.331298828125, "reward_std": 0.7615050077438354, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.31103515625, "rewards/ppl_reward/std": 8.111175537109375, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 126.609375, "completions/mean_terminated_length": 126.609375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.209564422784039, "grad_norm": 2.5108449459075928, "kl": 3.671875, "learning_rate": 1.3690742722613026e-05, "loss": 0.1454, "num_tokens": 34248544.0, "reward": -1.3760986328125, "reward_std": 0.9980607628822327, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.439697265625, "rewards/ppl_reward/std": 3.095898389816284, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 147.40625, "completions/mean_terminated_length": 147.40625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.210782820590923, "grad_norm": 4.232353210449219, "kl": 5.033203125, "learning_rate": 1.3682828659909948e-05, "loss": 0.4433, "num_tokens": 34264962.0, "reward": -2.3133544921875, "reward_std": 0.4725577235221863, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.501708984375, "rewards/ppl_reward/std": 6.058379173278809, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 117.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.212001218397807, "grad_norm": 2.635298013687134, "kl": 3.37109375, "learning_rate": 1.3674911927715626e-05, "loss": 0.1751, "num_tokens": 34278810.0, "reward": -1.22332763671875, "reward_std": 0.7445793151855469, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.2200927734375, "rewards/ppl_reward/std": 4.501760482788086, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 133.734375, "completions/mean_terminated_length": 133.734375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.213219616204691, "grad_norm": 4.385509014129639, "kl": 3.6640625, "learning_rate": 1.3666992531768482e-05, "loss": 0.1083, "num_tokens": 34294921.0, "reward": -2.0452880859375, "reward_std": 0.6607487797737122, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.778076171875, "rewards/ppl_reward/std": 6.936117649078369, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 125.921875, "completions/mean_terminated_length": 125.921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.214438014011575, "grad_norm": 2.3990426063537598, "kl": 5.7578125, "learning_rate": 1.365907047780888e-05, "loss": 0.2082, "num_tokens": 34310004.0, "reward": -1.193359375, "reward_std": 1.0590391159057617, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.05078125, "rewards/ppl_reward/std": 2.5790109634399414, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 126.921875, "completions/mean_terminated_length": 126.921875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.2156564118184585, "grad_norm": 4.1527252197265625, "kl": 6.53515625, "learning_rate": 1.3651145771579107e-05, "loss": 0.2775, "num_tokens": 34325111.0, "reward": -2.343505859375, "reward_std": 2.193488359451294, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.18701171875, "rewards/ppl_reward/std": 9.341959953308105, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2459997534751892, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 141.6875, "completions/mean_terminated_length": 141.6875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.2168748096253426, "grad_norm": 5.119863986968994, "kl": 11.265625, "learning_rate": 1.3643218418823367e-05, "loss": 0.5216, "num_tokens": 34341739.0, "reward": -1.7454833984375, "reward_std": 1.4001295566558838, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -6.756591796875, "rewards/ppl_reward/std": 3.4462943077087402, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 132.046875, "completions/mean_terminated_length": 132.046875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.2180932074322266, "grad_norm": 5.985520839691162, "kl": 9.4296875, "learning_rate": 1.3635288425287784e-05, "loss": 0.4466, "num_tokens": 34357198.0, "reward": -2.74932861328125, "reward_std": 1.176208257675171, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.9439697265625, "rewards/ppl_reward/std": 11.915346145629883, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 121.015625, "completions/mean_terminated_length": 121.015625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.2193116052391106, "grad_norm": 9.796661376953125, "kl": 15.5, "learning_rate": 1.3627355796720408e-05, "loss": 0.7062, "num_tokens": 34371599.0, "reward": -1.9310302734375, "reward_std": 1.850517988204956, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -6.955810546875, "rewards/ppl_reward/std": 6.445125579833984, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2630521357059479, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.2205300030459947, "grad_norm": 3.4267966747283936, "kl": 7.40625, "learning_rate": 1.361942053887118e-05, "loss": 0.3641, "num_tokens": 34387827.0, "reward": -0.9049072265625, "reward_std": 0.477505087852478, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.458251953125, "rewards/ppl_reward/std": 2.059621572494507, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 121.359375, "completions/mean_terminated_length": 121.359375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.2217484008528783, "grad_norm": 3.658665657043457, "kl": 8.22265625, "learning_rate": 1.3611482657491964e-05, "loss": 0.4737, "num_tokens": 34402066.0, "reward": -3.7476806640625, "reward_std": 4.437438488006592, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -11.034423828125, "rewards/ppl_reward/std": 20.569578170776367, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 122.078125, "completions/mean_terminated_length": 122.078125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.2229667986597623, "grad_norm": 3.0237717628479004, "kl": 7.46875, "learning_rate": 1.3603542158336509e-05, "loss": 0.363, "num_tokens": 34416975.0, "reward": -4.4716796875, "reward_std": 1.9900490045547485, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -12.466796875, "rewards/ppl_reward/std": 19.042194366455078, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 122.9375, "completions/mean_terminated_length": 122.9375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.2241851964666464, "grad_norm": 2.9205048084259033, "kl": 7.15625, "learning_rate": 1.359559904716048e-05, "loss": 0.3453, "num_tokens": 34432275.0, "reward": -0.4324951171875, "reward_std": 0.5777076482772827, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.372802734375, "rewards/ppl_reward/std": 2.5899674892425537, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 111.828125, "completions/mean_terminated_length": 111.828125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.2254035942735304, "grad_norm": 2.6810712814331055, "kl": 3.3828125, "learning_rate": 1.358765332972142e-05, "loss": 0.1041, "num_tokens": 34446000.0, "reward": -1.10205078125, "reward_std": 0.6772615909576416, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.9619140625, "rewards/ppl_reward/std": 3.123626947402954, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 116.203125, "completions/mean_terminated_length": 116.203125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.2266219920804144, "grad_norm": 3.1898353099823, "kl": 3.228515625, "learning_rate": 1.3579705011778767e-05, "loss": 0.1666, "num_tokens": 34460653.0, "reward": -1.79248046875, "reward_std": 0.8473032116889954, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.4052734375, "rewards/ppl_reward/std": 4.5475568771362305, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 113.953125, "completions/mean_terminated_length": 113.953125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.227840389887298, "grad_norm": 2.2530784606933594, "kl": 3.658203125, "learning_rate": 1.3571754099093848e-05, "loss": 0.0308, "num_tokens": 34474882.0, "reward": -1.99267578125, "reward_std": 1.2164306640625, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.5947265625, "rewards/ppl_reward/std": 4.037351608276367, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 114.21875, "completions/mean_terminated_length": 114.21875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.229058787694182, "grad_norm": 2.885599136352539, "kl": 2.865234375, "learning_rate": 1.3563800597429862e-05, "loss": 0.1079, "num_tokens": 34489192.0, "reward": -2.6009521484375, "reward_std": 0.9017317295074463, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.936279296875, "rewards/ppl_reward/std": 5.870107173919678, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 138.078125, "completions/mean_terminated_length": 138.078125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.230277185501066, "grad_norm": 1.7782899141311646, "kl": 2.314453125, "learning_rate": 1.3555844512551891e-05, "loss": 0.0783, "num_tokens": 34504533.0, "reward": -1.13775634765625, "reward_std": 0.3934730589389801, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.0723876953125, "rewards/ppl_reward/std": 5.444852352142334, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 124.765625, "completions/mean_terminated_length": 124.765625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.23149558330795, "grad_norm": 2.523085832595825, "kl": 5.48046875, "learning_rate": 1.354788585022689e-05, "loss": 0.1879, "num_tokens": 34519950.0, "reward": -1.449951171875, "reward_std": 0.7971048355102539, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.52490234375, "rewards/ppl_reward/std": 4.4280924797058105, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.232713981114834, "grad_norm": 3.4179117679595947, "kl": 5.04296875, "learning_rate": 1.3539924616223679e-05, "loss": 0.2615, "num_tokens": 34536310.0, "reward": -0.64532470703125, "reward_std": 0.7104794979095459, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.0015869140625, "rewards/ppl_reward/std": 2.8303310871124268, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.233932378921718, "grad_norm": 1.9898109436035156, "kl": 5.28125, "learning_rate": 1.3531960816312938e-05, "loss": 0.1094, "num_tokens": 34550590.0, "reward": -1.8914794921875, "reward_std": 1.7435029745101929, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.189208984375, "rewards/ppl_reward/std": 5.208159923553467, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.233588308095932, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 114.453125, "completions/mean_terminated_length": 114.453125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 2.235150776728602, "grad_norm": 1.7456042766571045, "kl": 4.064453125, "learning_rate": 1.352399445626722e-05, "loss": 0.1205, "num_tokens": 34564507.0, "reward": -1.3487548828125, "reward_std": 0.6592780947685242, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.400634765625, "rewards/ppl_reward/std": 3.589123487472534, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 128.265625, "completions/mean_terminated_length": 128.265625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.236369174535486, "grad_norm": 3.2498772144317627, "kl": 7.14453125, "learning_rate": 1.351602554186092e-05, "loss": 0.2926, "num_tokens": 34579916.0, "reward": -0.677001953125, "reward_std": 0.6440038084983826, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.87744140625, "rewards/ppl_reward/std": 2.0607259273529053, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11935414373874664, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 134.609375, "completions/mean_terminated_length": 134.609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.23758757234237, "grad_norm": 2.4426422119140625, "kl": 6.89453125, "learning_rate": 1.3508054078870293e-05, "loss": 0.3437, "num_tokens": 34595939.0, "reward": -1.6544189453125, "reward_std": 0.6173149943351746, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.863525390625, "rewards/ppl_reward/std": 3.3540804386138916, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 118.359375, "completions/mean_terminated_length": 118.359375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.2388059701492535, "grad_norm": 3.6170151233673096, "kl": 7.9296875, "learning_rate": 1.3500080073073436e-05, "loss": 0.2883, "num_tokens": 34610250.0, "reward": -1.220458984375, "reward_std": 0.8695386648178101, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.01123046875, "rewards/ppl_reward/std": 4.987405300140381, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 130.453125, "completions/mean_terminated_length": 130.453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.2400243679561376, "grad_norm": 2.641625165939331, "kl": 9.14453125, "learning_rate": 1.3492103530250296e-05, "loss": 0.4025, "num_tokens": 34625183.0, "reward": -0.8536376953125, "reward_std": 1.043731689453125, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.215087890625, "rewards/ppl_reward/std": 1.731338620185852, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 126.203125, "completions/mean_terminated_length": 126.203125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.2412427657630216, "grad_norm": 3.6679182052612305, "kl": 7.3359375, "learning_rate": 1.348412445618265e-05, "loss": 0.2028, "num_tokens": 34640004.0, "reward": -1.303955078125, "reward_std": 1.166473627090454, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.05322265625, "rewards/ppl_reward/std": 3.247744083404541, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.245463564991951, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 116.03125, "completions/mean_terminated_length": 116.03125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.2424611635699057, "grad_norm": 2.7304515838623047, "kl": 6.267578125, "learning_rate": 1.347614285665412e-05, "loss": 0.1806, "num_tokens": 34653886.0, "reward": -0.871337890625, "reward_std": 0.9629929065704346, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.26611328125, "rewards/ppl_reward/std": 3.8673088550567627, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 132.421875, "completions/mean_terminated_length": 132.421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.2436795613767897, "grad_norm": 2.7019033432006836, "kl": 5.26953125, "learning_rate": 1.3468158737450148e-05, "loss": 0.1491, "num_tokens": 34669641.0, "reward": -1.1318359375, "reward_std": 0.8098396062850952, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.865234375, "rewards/ppl_reward/std": 2.990978479385376, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 131.359375, "completions/mean_terminated_length": 131.359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.2448979591836733, "grad_norm": 2.2177767753601074, "kl": 3.3203125, "learning_rate": 1.3460172104358007e-05, "loss": 0.0476, "num_tokens": 34684720.0, "reward": -2.4093017578125, "reward_std": 0.7151197195053101, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.568603515625, "rewards/ppl_reward/std": 4.875856876373291, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 132.578125, "completions/mean_terminated_length": 132.578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.2461163569905573, "grad_norm": 2.3612349033355713, "kl": 4.0625, "learning_rate": 1.3452182963166792e-05, "loss": 0.1804, "num_tokens": 34700877.0, "reward": -0.7030029296875, "reward_std": 0.6898766160011292, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.023193359375, "rewards/ppl_reward/std": 2.358503580093384, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 128.859375, "completions/mean_terminated_length": 128.859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.2473347547974414, "grad_norm": 1.760460376739502, "kl": 2.57421875, "learning_rate": 1.3444191319667425e-05, "loss": 0.093, "num_tokens": 34715684.0, "reward": -1.5252685546875, "reward_std": 0.989057183265686, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.831787109375, "rewards/ppl_reward/std": 4.075857639312744, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 124.4375, "completions/mean_terminated_length": 124.4375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.2485531526043254, "grad_norm": 1.5260179042816162, "kl": 4.1220703125, "learning_rate": 1.3436197179652627e-05, "loss": 0.147, "num_tokens": 34730000.0, "reward": -1.208984375, "reward_std": 1.0145890712738037, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.98828125, "rewards/ppl_reward/std": 3.4838685989379883, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2257249802350998, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 121.65625, "completions/mean_terminated_length": 121.65625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.2497715504112095, "grad_norm": 2.61787748336792, "kl": 2.751953125, "learning_rate": 1.3428200548916931e-05, "loss": 0.0749, "num_tokens": 34743866.0, "reward": -1.32391357421875, "reward_std": 1.0660244226455688, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.3197021484375, "rewards/ppl_reward/std": 3.6744041442871094, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 134.765625, "completions/mean_terminated_length": 134.765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.250989948218093, "grad_norm": 1.9353207349777222, "kl": 3.7890625, "learning_rate": 1.342020143325669e-05, "loss": 0.1132, "num_tokens": 34759475.0, "reward": -5.6021728515625, "reward_std": 1.1882960796356201, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -14.829345703125, "rewards/ppl_reward/std": 26.208364486694336, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.252208346024977, "grad_norm": 1.6870542764663696, "kl": 3.39453125, "learning_rate": 1.3412199838470036e-05, "loss": 0.1252, "num_tokens": 34775483.0, "reward": -1.1864013671875, "reward_std": 0.7054295539855957, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.122802734375, "rewards/ppl_reward/std": 3.272496223449707, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 148.40625, "completions/mean_terminated_length": 148.40625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.253426743831861, "grad_norm": 2.0672898292541504, "kl": 4.7060546875, "learning_rate": 1.3404195770356919e-05, "loss": 0.1329, "num_tokens": 34792565.0, "reward": -1.9912109375, "reward_std": 1.133125901222229, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.693359375, "rewards/ppl_reward/std": 4.972702503204346, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 134.234375, "completions/mean_terminated_length": 134.234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.254645141638745, "grad_norm": 1.8355611562728882, "kl": 3.421875, "learning_rate": 1.3396189234719066e-05, "loss": 0.1533, "num_tokens": 34807732.0, "reward": -0.7359619140625, "reward_std": 0.3389843702316284, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.292236328125, "rewards/ppl_reward/std": 2.020827054977417, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 142.640625, "completions/mean_terminated_length": 142.640625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.2558635394456292, "grad_norm": 1.2728224992752075, "kl": 2.3486328125, "learning_rate": 1.338818023736e-05, "loss": -0.0292, "num_tokens": 34824717.0, "reward": -1.4813232421875, "reward_std": 0.5680491924285889, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.665771484375, "rewards/ppl_reward/std": 5.328316688537598, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 143.046875, "completions/mean_terminated_length": 143.046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.257081937252513, "grad_norm": 1.970638632774353, "kl": 5.580078125, "learning_rate": 1.3380168784085028e-05, "loss": 0.1588, "num_tokens": 34840920.0, "reward": -1.03790283203125, "reward_std": 1.222611904144287, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.6929931640625, "rewards/ppl_reward/std": 4.23068380355835, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.12739521265029907, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 138.109375, "completions/mean_terminated_length": 138.109375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.258300335059397, "grad_norm": 2.5783159732818604, "kl": 4.09375, "learning_rate": 1.3372154880701241e-05, "loss": 0.1646, "num_tokens": 34856447.0, "reward": -1.789794921875, "reward_std": 0.7555871605873108, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.23583984375, "rewards/ppl_reward/std": 2.9696152210235596, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 149.3125, "completions/mean_terminated_length": 149.3125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.259518732866281, "grad_norm": 2.22511887550354, "kl": 3.2958984375, "learning_rate": 1.3364138533017495e-05, "loss": 0.1473, "num_tokens": 34873219.0, "reward": 0.04278564453125, "reward_std": 0.411917507648468, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -3.6956787109375, "rewards/ppl_reward/std": 1.5011082887649536, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 148.140625, "completions/mean_terminated_length": 148.140625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.260737130673165, "grad_norm": 2.3138339519500732, "kl": 5.8515625, "learning_rate": 1.3356119746844423e-05, "loss": 0.2574, "num_tokens": 34889588.0, "reward": -1.18310546875, "reward_std": 0.4834173321723938, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.9912109375, "rewards/ppl_reward/std": 3.812638998031616, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.125, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 146.265625, "completions/mean_terminated_length": 146.265625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.2619555284800485, "grad_norm": 1.832480549812317, "kl": 4.94921875, "learning_rate": 1.3348098527994435e-05, "loss": 0.1501, "num_tokens": 34905757.0, "reward": -2.4852294921875, "reward_std": 0.7973229885101318, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.509521484375, "rewards/ppl_reward/std": 9.875858306884766, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 148.234375, "completions/mean_terminated_length": 148.234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.2631739262869326, "grad_norm": 2.4475510120391846, "kl": 5.05859375, "learning_rate": 1.3340074882281688e-05, "loss": 0.2319, "num_tokens": 34921964.0, "reward": -0.33685302734375, "reward_std": 0.573634922504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.3533935546875, "rewards/ppl_reward/std": 1.698866844177246, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 143.96875, "completions/mean_terminated_length": 143.96875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.2643923240938166, "grad_norm": 3.2586190700531006, "kl": 8.4921875, "learning_rate": 1.3332048815522112e-05, "loss": 0.3035, "num_tokens": 34938250.0, "reward": -0.8045654296875, "reward_std": 1.0764046907424927, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -4.898193359375, "rewards/ppl_reward/std": 2.4613120555877686, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1909000724554062, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 144.40625, "completions/mean_terminated_length": 144.40625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.2656107219007007, "grad_norm": 2.9810073375701904, "kl": 6.9765625, "learning_rate": 1.3324020333533378e-05, "loss": 0.2476, "num_tokens": 34954132.0, "reward": -0.8095703125, "reward_std": 0.8195663094520569, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.033203125, "rewards/ppl_reward/std": 2.4882190227508545, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18729320168495178, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 142.609375, "completions/mean_terminated_length": 142.609375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.2668291197075847, "grad_norm": 2.742985725402832, "kl": 4.677734375, "learning_rate": 1.3315989442134926e-05, "loss": 0.1536, "num_tokens": 34970555.0, "reward": -1.33203125, "reward_std": 0.680870532989502, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.3125, "rewards/ppl_reward/std": 2.5264904499053955, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 147.296875, "completions/mean_terminated_length": 147.296875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.2680475175144683, "grad_norm": 1.7814213037490845, "kl": 3.0078125, "learning_rate": 1.3307956147147924e-05, "loss": 0.155, "num_tokens": 34987158.0, "reward": -1.88092041015625, "reward_std": 0.3454267382621765, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.5743408203125, "rewards/ppl_reward/std": 5.959123134613037, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 146.359375, "completions/mean_terminated_length": 146.359375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.2692659153213524, "grad_norm": 1.6732032299041748, "kl": 3.5673828125, "learning_rate": 1.3299920454395296e-05, "loss": 0.1212, "num_tokens": 35004717.0, "reward": -1.0130615234375, "reward_std": 0.42867881059646606, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.807373046875, "rewards/ppl_reward/std": 4.609462261199951, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 137.03125, "completions/mean_terminated_length": 137.03125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.2704843131282364, "grad_norm": 2.421964406967163, "kl": 4.779296875, "learning_rate": 1.3291882369701694e-05, "loss": 0.1984, "num_tokens": 35020055.0, "reward": -1.1688232421875, "reward_std": 0.9240455627441406, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.962646484375, "rewards/ppl_reward/std": 2.1867570877075195, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.2717027109351204, "grad_norm": 3.0748403072357178, "kl": 4.4736328125, "learning_rate": 1.328384189889351e-05, "loss": 0.0883, "num_tokens": 35035119.0, "reward": -0.60498046875, "reward_std": 0.6120303869247437, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.7099609375, "rewards/ppl_reward/std": 1.7237749099731445, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 132.359375, "completions/mean_terminated_length": 132.359375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.272921108742004, "grad_norm": 2.9205074310302734, "kl": 4.310546875, "learning_rate": 1.3275799047798864e-05, "loss": 0.0688, "num_tokens": 35050574.0, "reward": -1.19696044921875, "reward_std": 0.8100661635398865, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.0736083984375, "rewards/ppl_reward/std": 2.8678359985351562, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 139.484375, "completions/mean_terminated_length": 139.484375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.274139506548888, "grad_norm": 1.6948779821395874, "kl": 2.943359375, "learning_rate": 1.32677538222476e-05, "loss": 0.0473, "num_tokens": 35066717.0, "reward": -1.467041015625, "reward_std": 0.6013606786727905, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.73876953125, "rewards/ppl_reward/std": 3.5912387371063232, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 138.828125, "completions/mean_terminated_length": 138.828125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.275357904355772, "grad_norm": 1.5558521747589111, "kl": 4.6484375, "learning_rate": 1.3259706228071286e-05, "loss": 0.1831, "num_tokens": 35083042.0, "reward": -2.1141357421875, "reward_std": 0.49069276452064514, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.837646484375, "rewards/ppl_reward/std": 4.012487888336182, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 148.515625, "completions/mean_terminated_length": 148.515625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.276576302162656, "grad_norm": 1.6444697380065918, "kl": 5.625, "learning_rate": 1.3251656271103203e-05, "loss": 0.2489, "num_tokens": 35100243.0, "reward": -2.1368408203125, "reward_std": 1.056816816329956, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.765869140625, "rewards/ppl_reward/std": 7.8149027824401855, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22736713290214539, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 120.890625, "completions/mean_terminated_length": 120.890625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.27779469996954, "grad_norm": 2.9952800273895264, "kl": 5.25, "learning_rate": 1.324360395717835e-05, "loss": 0.1828, "num_tokens": 35114708.0, "reward": -9.09033203125, "reward_std": 6.877448081970215, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -21.7666015625, "rewards/ppl_reward/std": 39.59052658081055, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 135.03125, "completions/mean_terminated_length": 135.03125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.2790130977764242, "grad_norm": 1.9354705810546875, "kl": 6.009765625, "learning_rate": 1.3235549292133425e-05, "loss": 0.2805, "num_tokens": 35130438.0, "reward": -0.00982666015625, "reward_std": 0.6447871327400208, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.7618408203125, "rewards/ppl_reward/std": 2.5497379302978516, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 132.390625, "completions/mean_terminated_length": 132.390625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.280231495583308, "grad_norm": 3.2641372680664062, "kl": 10.1875, "learning_rate": 1.3227492281806845e-05, "loss": 0.469, "num_tokens": 35145535.0, "reward": -1.233642578125, "reward_std": 1.3173259496688843, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.81103515625, "rewards/ppl_reward/std": 3.5103821754455566, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2514837086200714, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.281449893390192, "grad_norm": 2.401080369949341, "kl": 5.35546875, "learning_rate": 1.3219432932038712e-05, "loss": 0.1567, "num_tokens": 35160563.0, "reward": -0.3990478515625, "reward_std": 0.8404919505119324, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.313720703125, "rewards/ppl_reward/std": 1.6768206357955933, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 133.515625, "completions/mean_terminated_length": 133.515625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.282668291197076, "grad_norm": 1.8644886016845703, "kl": 4.99609375, "learning_rate": 1.321137124867083e-05, "loss": 0.2087, "num_tokens": 35175988.0, "reward": -1.6546630859375, "reward_std": 0.5435948967933655, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.973388671875, "rewards/ppl_reward/std": 7.774548053741455, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 160.078125, "completions/mean_terminated_length": 160.078125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.28388668900396, "grad_norm": 2.161810874938965, "kl": 7.79296875, "learning_rate": 1.3203307237546699e-05, "loss": 0.4205, "num_tokens": 35194225.0, "reward": -1.781982421875, "reward_std": 0.6563346982002258, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.20458984375, "rewards/ppl_reward/std": 4.984718322753906, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 124.63492584228516, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.2851050868108436, "grad_norm": 4.160489082336426, "kl": 10.5703125, "learning_rate": 1.3195240904511497e-05, "loss": 0.5754, "num_tokens": 35209565.0, "reward": -2.4447021484375, "reward_std": 2.540693521499634, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.280029296875, "rewards/ppl_reward/std": 11.7504301071167, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2596118450164795, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.2863234846177276, "grad_norm": 1.6905624866485596, "kl": 5.52734375, "learning_rate": 1.3187172255412097e-05, "loss": 0.1574, "num_tokens": 35225335.0, "reward": -3.4671630859375, "reward_std": 2.9318954944610596, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.520263671875, "rewards/ppl_reward/std": 11.650423049926758, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 149.28125, "completions/mean_terminated_length": 149.28125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.2875418824246117, "grad_norm": 1.7594707012176514, "kl": 4.921875, "learning_rate": 1.3179101296097035e-05, "loss": 0.1354, "num_tokens": 35244313.0, "reward": -0.53533935546875, "reward_std": 1.2872402667999268, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -4.3831787109375, "rewards/ppl_reward/std": 3.017082452774048, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2592533528804779, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 130.390625, "completions/mean_terminated_length": 130.390625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.2887602802314957, "grad_norm": 2.0311946868896484, "kl": 3.71875, "learning_rate": 1.3171028032416533e-05, "loss": 0.2356, "num_tokens": 35259682.0, "reward": -5.315673828125, "reward_std": 1.1449830532073975, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -14.31884765625, "rewards/ppl_reward/std": 23.195833206176758, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 144.65625, "completions/mean_terminated_length": 144.65625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.2899786780383797, "grad_norm": 6.14845609664917, "kl": 3.419921875, "learning_rate": 1.3162952470222488e-05, "loss": 0.1375, "num_tokens": 35276460.0, "reward": -1.5128173828125, "reward_std": 0.3920159637928009, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.916259765625, "rewards/ppl_reward/std": 5.141786098480225, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 129.359375, "completions/mean_terminated_length": 129.359375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.2911970758452633, "grad_norm": 1.8253086805343628, "kl": 1.8017578125, "learning_rate": 1.3154874615368444e-05, "loss": 0.0297, "num_tokens": 35292179.0, "reward": -0.3936767578125, "reward_std": 0.25612619519233704, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.670166015625, "rewards/ppl_reward/std": 1.4976656436920166, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 157.078125, "completions/mean_terminated_length": 157.078125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.2924154736521474, "grad_norm": 3.241302251815796, "kl": 3.111328125, "learning_rate": 1.3146794473709628e-05, "loss": 0.2502, "num_tokens": 35310728.0, "reward": -0.5555419921875, "reward_std": 0.4941079020500183, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.853271484375, "rewards/ppl_reward/std": 2.422854423522949, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 130.140625, "completions/mean_terminated_length": 130.140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.2936338714590314, "grad_norm": 2.3535025119781494, "kl": 2.517578125, "learning_rate": 1.3138712051102908e-05, "loss": 0.0912, "num_tokens": 35326129.0, "reward": -1.118896484375, "reward_std": 1.051792025566101, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.11279296875, "rewards/ppl_reward/std": 5.657306671142578, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 124.9375, "completions/mean_terminated_length": 124.9375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.2948522692659155, "grad_norm": 1.5636202096939087, "kl": 3.361328125, "learning_rate": 1.3130627353406818e-05, "loss": 0.0848, "num_tokens": 35341045.0, "reward": -0.65191650390625, "reward_std": 0.5559849739074707, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.9678955078125, "rewards/ppl_reward/std": 2.2445414066314697, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.296070667072799, "grad_norm": 1.967392921447754, "kl": 8.46875, "learning_rate": 1.3122540386481533e-05, "loss": 0.5121, "num_tokens": 35356317.0, "reward": -2.6484375, "reward_std": 2.5035903453826904, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.9296875, "rewards/ppl_reward/std": 12.271079063415527, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.297289064879683, "grad_norm": 2.318826913833618, "kl": 4.0390625, "learning_rate": 1.3114451156188876e-05, "loss": 0.1412, "num_tokens": 35370749.0, "reward": -1.665771484375, "reward_std": 1.2558081150054932, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.01123046875, "rewards/ppl_reward/std": 4.540666103363037, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 151.453125, "completions/mean_terminated_length": 151.453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.298507462686567, "grad_norm": 3.9028358459472656, "kl": 9.65625, "learning_rate": 1.310635966839231e-05, "loss": 0.4856, "num_tokens": 35387698.0, "reward": -0.7529296875, "reward_std": 0.6549375057220459, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.091796875, "rewards/ppl_reward/std": 2.081836223602295, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.299725860493451, "grad_norm": 4.624975204467773, "kl": 10.21875, "learning_rate": 1.3098265928956931e-05, "loss": 0.5022, "num_tokens": 35404050.0, "reward": -2.8681640625, "reward_std": 1.3901129961013794, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.322265625, "rewards/ppl_reward/std": 7.403484344482422, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17390352487564087, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 144.4375, "completions/mean_terminated_length": 144.4375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.300944258300335, "grad_norm": 1.812817096710205, "kl": 6.75390625, "learning_rate": 1.3090169943749475e-05, "loss": 0.3762, "num_tokens": 35420750.0, "reward": -0.7681884765625, "reward_std": 0.5501962900161743, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.177001953125, "rewards/ppl_reward/std": 1.6414340734481812, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 130.90625, "completions/mean_terminated_length": 130.90625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.302162656107219, "grad_norm": 2.6922731399536133, "kl": 9.640625, "learning_rate": 1.3082071718638301e-05, "loss": 0.4714, "num_tokens": 35435744.0, "reward": -1.75579833984375, "reward_std": 1.5750290155410767, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.8787841796875, "rewards/ppl_reward/std": 5.311521530151367, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21647055447101593, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.303381053914103, "grad_norm": 5.138086318969727, "kl": 6.123046875, "learning_rate": 1.3073971259493388e-05, "loss": 0.2038, "num_tokens": 35450452.0, "reward": -2.6475830078125, "reward_std": 1.9304124116897583, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.818603515625, "rewards/ppl_reward/std": 10.421392440795898, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 127.484375, "completions/mean_terminated_length": 127.484375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.304599451720987, "grad_norm": 3.222947597503662, "kl": 8.953125, "learning_rate": 1.3065868572186342e-05, "loss": 0.4444, "num_tokens": 35464859.0, "reward": -1.43798828125, "reward_std": 0.6907968521118164, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.3916015625, "rewards/ppl_reward/std": 2.6538825035095215, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.305817849527871, "grad_norm": 1.9608218669891357, "kl": 6.24609375, "learning_rate": 1.3057763662590377e-05, "loss": 0.2577, "num_tokens": 35481027.0, "reward": -0.3043212890625, "reward_std": 0.6393139362335205, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.210205078125, "rewards/ppl_reward/std": 2.625725507736206, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 149.203125, "completions/mean_terminated_length": 149.203125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.307036247334755, "grad_norm": 2.5761842727661133, "kl": 8.0703125, "learning_rate": 1.3049656536580326e-05, "loss": 0.398, "num_tokens": 35498072.0, "reward": -1.8814697265625, "reward_std": 1.6673482656478882, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.177001953125, "rewards/ppl_reward/std": 6.717288017272949, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.24722543358802795, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 134.578125, "completions/mean_terminated_length": 134.578125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.3082546451416386, "grad_norm": 1.7163053750991821, "kl": 4.296875, "learning_rate": 1.3041547200032618e-05, "loss": 0.1586, "num_tokens": 35513669.0, "reward": -1.18505859375, "reward_std": 0.5199904441833496, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.0576171875, "rewards/ppl_reward/std": 2.3783814907073975, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 126.203125, "completions/mean_terminated_length": 126.203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.3094730429485226, "grad_norm": 1.8474422693252563, "kl": 6.8828125, "learning_rate": 1.3033435658825293e-05, "loss": 0.2897, "num_tokens": 35528322.0, "reward": -2.2255859375, "reward_std": 1.3143192529678345, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.982421875, "rewards/ppl_reward/std": 6.804908275604248, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23091863095760345, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 146.265625, "completions/mean_terminated_length": 146.265625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.3106914407554067, "grad_norm": 1.9304471015930176, "kl": 4.12890625, "learning_rate": 1.3025321918837985e-05, "loss": 0.1374, "num_tokens": 35545843.0, "reward": -0.2655029296875, "reward_std": 0.5588718056678772, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.234130859375, "rewards/ppl_reward/std": 2.423015594482422, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 135.328125, "completions/mean_terminated_length": 135.328125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.3119098385622907, "grad_norm": 2.1347742080688477, "kl": 5.18359375, "learning_rate": 1.3017205985951926e-05, "loss": 0.196, "num_tokens": 35561216.0, "reward": -0.7677001953125, "reward_std": 0.5994015336036682, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.191650390625, "rewards/ppl_reward/std": 2.153956174850464, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.3131282363691748, "grad_norm": 1.5415401458740234, "kl": 3.150390625, "learning_rate": 1.300908786604993e-05, "loss": 0.0731, "num_tokens": 35577656.0, "reward": -0.08203125, "reward_std": 0.4430427551269531, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.8515625, "rewards/ppl_reward/std": 0.8156465888023376, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 139.8125, "completions/mean_terminated_length": 125.77778625488281, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.3143466341760583, "grad_norm": 3.7856640815734863, "kl": 6.34375, "learning_rate": 1.30009675650164e-05, "loss": 0.4196, "num_tokens": 35593340.0, "reward": -0.8592529296875, "reward_std": 0.8748065829277039, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.429443359375, "rewards/ppl_reward/std": 3.805133819580078, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 142.015625, "completions/mean_terminated_length": 142.015625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 2.3155650319829424, "grad_norm": 1.854827880859375, "kl": 3.951171875, "learning_rate": 1.2992845088737323e-05, "loss": 0.0487, "num_tokens": 35609509.0, "reward": -1.7625732421875, "reward_std": 1.1362669467926025, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.181396484375, "rewards/ppl_reward/std": 5.449355125427246, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.208927720785141, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.3167834297898264, "grad_norm": 1.4000036716461182, "kl": 3.1875, "learning_rate": 1.2984720443100261e-05, "loss": 0.1186, "num_tokens": 35625021.0, "reward": -0.1202392578125, "reward_std": 0.44712409377098083, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.967041015625, "rewards/ppl_reward/std": 1.650773286819458, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 136.203125, "completions/mean_terminated_length": 136.203125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.3180018275967105, "grad_norm": 2.0686984062194824, "kl": 7.48828125, "learning_rate": 1.2976593633994347e-05, "loss": 0.3016, "num_tokens": 35641058.0, "reward": -1.1800537109375, "reward_std": 0.9406575560569763, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.711669921875, "rewards/ppl_reward/std": 3.384646415710449, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2348787635564804, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 129.828125, "completions/mean_terminated_length": 129.828125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.319220225403594, "grad_norm": 2.0058095455169678, "kl": 3.3681640625, "learning_rate": 1.296846466731028e-05, "loss": 0.0655, "num_tokens": 35655815.0, "reward": -1.806640625, "reward_std": 0.774692714214325, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.29296875, "rewards/ppl_reward/std": 2.0500051975250244, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 2.320438623210478, "grad_norm": 2.6262285709381104, "kl": 3.7255859375, "learning_rate": 1.2960333548940334e-05, "loss": 0.1927, "num_tokens": 35672528.0, "reward": -0.5887451171875, "reward_std": 0.547622799873352, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.989990234375, "rewards/ppl_reward/std": 1.6094999313354492, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.321657021017362, "grad_norm": 1.3998968601226807, "kl": 3.220703125, "learning_rate": 1.2952200284778323e-05, "loss": 0.1197, "num_tokens": 35689064.0, "reward": -0.85211181640625, "reward_std": 1.0359128713607788, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.5167236328125, "rewards/ppl_reward/std": 6.040870666503906, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 142.015625, "completions/mean_terminated_length": 142.015625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.322875418824246, "grad_norm": 1.402223825454712, "kl": 4.8876953125, "learning_rate": 1.2944064880719634e-05, "loss": 0.1589, "num_tokens": 35705449.0, "reward": -1.0498046875, "reward_std": 0.9250521063804626, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.787109375, "rewards/ppl_reward/std": 4.0164337158203125, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 132.671875, "completions/mean_terminated_length": 132.671875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.3240938166311302, "grad_norm": 1.7128725051879883, "kl": 4.25, "learning_rate": 1.2935927342661204e-05, "loss": 0.0439, "num_tokens": 35720844.0, "reward": -1.099853515625, "reward_std": 1.2209748029708862, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.81689453125, "rewards/ppl_reward/std": 2.677321434020996, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.25219154357910156, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 133.09375, "completions/mean_terminated_length": 133.09375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.325312214438014, "grad_norm": 2.1926727294921875, "kl": 4.62890625, "learning_rate": 1.29277876765015e-05, "loss": 0.0539, "num_tokens": 35735890.0, "reward": -1.2132568359375, "reward_std": 0.9763815402984619, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.996826171875, "rewards/ppl_reward/std": 3.3855738639831543, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.326530612244898, "grad_norm": 1.7841274738311768, "kl": 4.77734375, "learning_rate": 1.2919645888140551e-05, "loss": 0.1665, "num_tokens": 35752314.0, "reward": -1.6522216796875, "reward_std": 0.6146810054779053, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.890380859375, "rewards/ppl_reward/std": 2.3557817935943604, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 144.71875, "completions/mean_terminated_length": 144.71875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.327749010051782, "grad_norm": 2.415440320968628, "kl": 5.43359375, "learning_rate": 1.2911501983479915e-05, "loss": 0.1962, "num_tokens": 35768456.0, "reward": -2.82666015625, "reward_std": 1.2390609979629517, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.3720703125, "rewards/ppl_reward/std": 9.188924789428711, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 135.96875, "completions/mean_terminated_length": 135.96875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.328967407858666, "grad_norm": 2.2692413330078125, "kl": 2.95703125, "learning_rate": 1.2903355968422688e-05, "loss": 0.0112, "num_tokens": 35783766.0, "reward": -3.2474365234375, "reward_std": 2.2407896518707275, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -10.252685546875, "rewards/ppl_reward/std": 14.398487091064453, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 162.421875, "completions/mean_terminated_length": 148.74603271484375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.3301858056655496, "grad_norm": 5.786649703979492, "kl": 6.6328125, "learning_rate": 1.2895207848873488e-05, "loss": 0.2005, "num_tokens": 35801113.0, "reward": -3.1246337890625, "reward_std": 1.6086324453353882, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.858642578125, "rewards/ppl_reward/std": 8.766999244689941, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 144.046875, "completions/mean_terminated_length": 144.046875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.3314042034724336, "grad_norm": 3.8024067878723145, "kl": 6.94140625, "learning_rate": 1.2887057630738466e-05, "loss": 0.2773, "num_tokens": 35816732.0, "reward": -1.1317138671875, "reward_std": 1.4947161674499512, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.810302734375, "rewards/ppl_reward/std": 4.111734390258789, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.3326226012793176, "grad_norm": 2.1681485176086426, "kl": 5.90625, "learning_rate": 1.2878905319925296e-05, "loss": 0.2037, "num_tokens": 35834164.0, "reward": -4.6927490234375, "reward_std": 1.0607157945632935, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -13.002685546875, "rewards/ppl_reward/std": 23.094276428222656, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.3338409990862017, "grad_norm": 1.4006671905517578, "kl": 5.00390625, "learning_rate": 1.287075092234316e-05, "loss": 0.1808, "num_tokens": 35851712.0, "reward": -0.595703125, "reward_std": 0.6566905379295349, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.82421875, "rewards/ppl_reward/std": 2.4198381900787354, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 152.21875, "completions/mean_terminated_length": 152.21875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.3350593968930857, "grad_norm": 1.9795732498168945, "kl": 3.890625, "learning_rate": 1.2862594443902759e-05, "loss": 0.0749, "num_tokens": 35868654.0, "reward": -1.4517822265625, "reward_std": 0.6168465614318848, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.520751953125, "rewards/ppl_reward/std": 3.3945436477661133, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21474508941173553, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.3362777946999698, "grad_norm": 2.1384239196777344, "kl": 4.95703125, "learning_rate": 1.2854435890516301e-05, "loss": 0.0857, "num_tokens": 35885614.0, "reward": -2.2047119140625, "reward_std": 1.5636223554611206, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.901611328125, "rewards/ppl_reward/std": 6.044312000274658, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2229611724615097, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 160.859375, "completions/mean_terminated_length": 160.859375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.3374961925068534, "grad_norm": 1.94896399974823, "kl": 4.56640625, "learning_rate": 1.2846275268097494e-05, "loss": 0.1417, "num_tokens": 35903101.0, "reward": -0.9122314453125, "reward_std": 0.6921175718307495, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.449462890625, "rewards/ppl_reward/std": 2.268890142440796, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 161.5625, "completions/mean_terminated_length": 161.5625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.3387145903137374, "grad_norm": 3.3695485591888428, "kl": 4.0693359375, "learning_rate": 1.2838112582561554e-05, "loss": 0.2909, "num_tokens": 35920065.0, "reward": -0.4417724609375, "reward_std": 0.6662670373916626, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.625732421875, "rewards/ppl_reward/std": 2.592061996459961, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 162.078125, "completions/mean_terminated_length": 162.078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.3399329881206214, "grad_norm": 1.5335179567337036, "kl": 3.296875, "learning_rate": 1.2829947839825189e-05, "loss": 0.1106, "num_tokens": 35937646.0, "reward": -0.647216796875, "reward_std": 0.8752574920654297, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.07568359375, "rewards/ppl_reward/std": 4.016256332397461, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 152.78125, "completions/mean_terminated_length": 152.78125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.3411513859275055, "grad_norm": 1.3948431015014648, "kl": 2.6875, "learning_rate": 1.2821781045806591e-05, "loss": 0.0259, "num_tokens": 35954432.0, "reward": -0.72705078125, "reward_std": 0.47675129771232605, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.2119140625, "rewards/ppl_reward/std": 2.948603868484497, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 167.671875, "completions/mean_terminated_length": 167.671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.342369783734389, "grad_norm": 1.330277442932129, "kl": 4.6005859375, "learning_rate": 1.281361220642545e-05, "loss": 0.1699, "num_tokens": 35972683.0, "reward": -0.744384765625, "reward_std": 0.784853458404541, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.13720703125, "rewards/ppl_reward/std": 2.7975385189056396, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 151.515625, "completions/mean_terminated_length": 151.515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.343588181541273, "grad_norm": 1.5339261293411255, "kl": 2.9853515625, "learning_rate": 1.280544132760293e-05, "loss": 0.1117, "num_tokens": 35989020.0, "reward": -1.6318359375, "reward_std": 0.5326039791107178, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.123046875, "rewards/ppl_reward/std": 6.834725856781006, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 163.578125, "completions/mean_terminated_length": 163.578125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.344806579348157, "grad_norm": 1.6174296140670776, "kl": 4.2685546875, "learning_rate": 1.2797268415261681e-05, "loss": 0.1506, "num_tokens": 36007177.0, "reward": -0.4278564453125, "reward_std": 0.8297938108444214, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.527587890625, "rewards/ppl_reward/std": 2.5715484619140625, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 157.78125, "completions/mean_terminated_length": 157.78125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.346024977155041, "grad_norm": 1.1636627912521362, "kl": 3.09765625, "learning_rate": 1.2789093475325818e-05, "loss": 0.0873, "num_tokens": 36024267.0, "reward": -1.22674560546875, "reward_std": 1.0776300430297852, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.0941162109375, "rewards/ppl_reward/std": 3.828639268875122, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 162.28125, "completions/mean_terminated_length": 162.28125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.3472433749619253, "grad_norm": 1.4190536737442017, "kl": 2.84765625, "learning_rate": 1.2780916513720934e-05, "loss": 0.044, "num_tokens": 36042541.0, "reward": -2.595947265625, "reward_std": 0.772830605506897, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.97314453125, "rewards/ppl_reward/std": 5.241528034210205, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 145.390625, "completions/mean_terminated_length": 145.390625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.348461772768809, "grad_norm": 2.399883985519409, "kl": 4.876953125, "learning_rate": 1.277273753637408e-05, "loss": 0.1267, "num_tokens": 36058214.0, "reward": -1.34649658203125, "reward_std": 1.5153038501739502, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.3961181640625, "rewards/ppl_reward/std": 7.34960412979126, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 162.46875, "completions/mean_terminated_length": 162.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.349680170575693, "grad_norm": 1.7055160999298096, "kl": 3.6416015625, "learning_rate": 1.2764556549213775e-05, "loss": 0.1206, "num_tokens": 36075932.0, "reward": -0.8221435546875, "reward_std": 0.3509364128112793, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.402099609375, "rewards/ppl_reward/std": 2.429306983947754, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 150.703125, "completions/mean_terminated_length": 150.703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.350898568382577, "grad_norm": 3.0395941734313965, "kl": 5.078125, "learning_rate": 1.2756373558169992e-05, "loss": 0.1637, "num_tokens": 36092825.0, "reward": -2.3720703125, "reward_std": 0.5481091737747192, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.392578125, "rewards/ppl_reward/std": 4.653595447540283, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 170.96875, "completions/mean_terminated_length": 170.96875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.352116966189461, "grad_norm": 2.8961329460144043, "kl": 8.4921875, "learning_rate": 1.2748188569174156e-05, "loss": 0.4375, "num_tokens": 36111143.0, "reward": -0.88232421875, "reward_std": 0.9653245806694031, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.3115234375, "rewards/ppl_reward/std": 2.524759292602539, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.3533353639963446, "grad_norm": 1.6820785999298096, "kl": 3.3984375, "learning_rate": 1.2740001588159139e-05, "loss": -0.0068, "num_tokens": 36126063.0, "reward": -2.501220703125, "reward_std": 0.9968632459640503, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.61181640625, "rewards/ppl_reward/std": 5.776340484619141, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 155.859375, "completions/mean_terminated_length": 155.859375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.3545537618032286, "grad_norm": 2.0842888355255127, "kl": 4.80859375, "learning_rate": 1.2731812621059262e-05, "loss": 0.0898, "num_tokens": 36143262.0, "reward": -3.31982421875, "reward_std": 2.8158023357391357, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.2021484375, "rewards/ppl_reward/std": 16.326385498046875, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 141.921875, "completions/mean_terminated_length": 141.921875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.3557721596101127, "grad_norm": 1.611946940422058, "kl": 4.9765625, "learning_rate": 1.2723621673810277e-05, "loss": 0.1979, "num_tokens": 36158865.0, "reward": -4.030517578125, "reward_std": 1.50057053565979, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -11.63916015625, "rewards/ppl_reward/std": 12.533527374267578, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 150.765625, "completions/mean_terminated_length": 150.765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.3569905574169967, "grad_norm": 2.843146800994873, "kl": 4.681640625, "learning_rate": 1.2715428752349377e-05, "loss": 0.1567, "num_tokens": 36175202.0, "reward": -2.1162109375, "reward_std": 1.1946526765823364, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.927734375, "rewards/ppl_reward/std": 4.148251056671143, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 146.03125, "completions/mean_terminated_length": 146.03125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.3582089552238807, "grad_norm": 1.5372309684753418, "kl": 2.35546875, "learning_rate": 1.2707233862615187e-05, "loss": 0.0121, "num_tokens": 36191756.0, "reward": -1.5537109375, "reward_std": 0.40071576833724976, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.912109375, "rewards/ppl_reward/std": 4.414355278015137, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 136.578125, "completions/mean_terminated_length": 136.578125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.359427353030765, "grad_norm": 1.334938645362854, "kl": 2.646484375, "learning_rate": 1.2699037010547759e-05, "loss": -0.0187, "num_tokens": 36207961.0, "reward": -0.91650390625, "reward_std": 1.0937514305114746, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.4423828125, "rewards/ppl_reward/std": 4.797952651977539, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.3606457508376484, "grad_norm": 1.447295069694519, "kl": 3.70703125, "learning_rate": 1.2690838202088562e-05, "loss": 0.1038, "num_tokens": 36223493.0, "reward": -1.2103271484375, "reward_std": 0.706536591053009, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.178466796875, "rewards/ppl_reward/std": 2.8513123989105225, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 146.53125, "completions/mean_terminated_length": 146.53125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.3618641486445324, "grad_norm": 3.0406076908111572, "kl": 4.56640625, "learning_rate": 1.2682637443180485e-05, "loss": 0.1742, "num_tokens": 36239543.0, "reward": -4.40423583984375, "reward_std": 1.9469482898712158, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -12.3709716796875, "rewards/ppl_reward/std": 14.521893501281738, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15430335700511932, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 152.796875, "completions/mean_terminated_length": 152.796875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.3630825464514165, "grad_norm": 1.6256370544433594, "kl": 2.4921875, "learning_rate": 1.267443473976784e-05, "loss": 0.0011, "num_tokens": 36256802.0, "reward": -1.0025634765625, "reward_std": 0.9374033808708191, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.700439453125, "rewards/ppl_reward/std": 3.485102415084839, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 162.890625, "completions/mean_terminated_length": 162.890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.3643009442583005, "grad_norm": 1.9960795640945435, "kl": 3.984375, "learning_rate": 1.2666230097796333e-05, "loss": 0.0752, "num_tokens": 36274355.0, "reward": -0.9140625, "reward_std": 0.7112466096878052, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.578125, "rewards/ppl_reward/std": 3.111908435821533, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 146.984375, "completions/mean_terminated_length": 146.984375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 2.365519342065184, "grad_norm": 1.6821227073669434, "kl": 3.111328125, "learning_rate": 1.2658023523213086e-05, "loss": 0.1446, "num_tokens": 36290882.0, "reward": -2.319091796875, "reward_std": 0.4399973750114441, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.52099609375, "rewards/ppl_reward/std": 9.585923194885254, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 144.765625, "completions/mean_terminated_length": 144.765625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 2.366737739872068, "grad_norm": 1.2255773544311523, "kl": 1.4609375, "learning_rate": 1.264981502196662e-05, "loss": -0.0469, "num_tokens": 36306379.0, "reward": -4.9656982421875, "reward_std": 0.8297780156135559, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -13.751708984375, "rewards/ppl_reward/std": 20.96424674987793, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 158.984375, "completions/mean_terminated_length": 158.984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.367956137678952, "grad_norm": 1.549441933631897, "kl": 3.208984375, "learning_rate": 1.2641604600006847e-05, "loss": 0.155, "num_tokens": 36323738.0, "reward": -1.4503173828125, "reward_std": 0.48482105135917664, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.744384765625, "rewards/ppl_reward/std": 3.8105199337005615, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 160.46875, "completions/mean_terminated_length": 160.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.3691745354858362, "grad_norm": 1.1933789253234863, "kl": 1.2783203125, "learning_rate": 1.2633392263285079e-05, "loss": 0.0301, "num_tokens": 36340872.0, "reward": -1.0294189453125, "reward_std": 0.3016652762889862, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.972900390625, "rewards/ppl_reward/std": 3.267390251159668, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 162.296875, "completions/mean_terminated_length": 162.296875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.3703929332927203, "grad_norm": 2.0062503814697266, "kl": 1.9814453125, "learning_rate": 1.2625178017754011e-05, "loss": 0.1027, "num_tokens": 36358899.0, "reward": -0.783203125, "reward_std": 0.42569229006767273, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.44921875, "rewards/ppl_reward/std": 2.0400032997131348, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 157.90625, "completions/mean_terminated_length": 157.90625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.371611331099604, "grad_norm": 1.6127582788467407, "kl": 3.646484375, "learning_rate": 1.2616961869367725e-05, "loss": 0.1236, "num_tokens": 36376893.0, "reward": -0.6510009765625, "reward_std": 0.5759194493293762, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.059814453125, "rewards/ppl_reward/std": 4.797634601593018, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 144.921875, "completions/mean_terminated_length": 144.921875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.372829728906488, "grad_norm": 1.3917489051818848, "kl": 3.5625, "learning_rate": 1.260874382408168e-05, "loss": 0.1189, "num_tokens": 36392904.0, "reward": -1.169677734375, "reward_std": 0.8595473766326904, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.05029296875, "rewards/ppl_reward/std": 2.9609711170196533, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 142.609375, "completions/mean_terminated_length": 142.609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.374048126713372, "grad_norm": 1.5601112842559814, "kl": 2.564453125, "learning_rate": 1.2600523887852707e-05, "loss": 0.0674, "num_tokens": 36409639.0, "reward": -1.2606201171875, "reward_std": 0.6035269498825073, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.310302734375, "rewards/ppl_reward/std": 3.3695106506347656, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 161.109375, "completions/mean_terminated_length": 161.109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.375266524520256, "grad_norm": 2.616281270980835, "kl": 7.421875, "learning_rate": 1.2592302066639013e-05, "loss": 0.3223, "num_tokens": 36426910.0, "reward": -0.57275390625, "reward_std": 0.706311821937561, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.7705078125, "rewards/ppl_reward/std": 2.0763254165649414, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 145.46875, "completions/mean_terminated_length": 145.46875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.3764849223271396, "grad_norm": 2.7770228385925293, "kl": 7.26953125, "learning_rate": 1.258407836640017e-05, "loss": 0.4298, "num_tokens": 36443444.0, "reward": -3.08251953125, "reward_std": 0.46824684739112854, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.9775390625, "rewards/ppl_reward/std": 12.837872505187988, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 144.203125, "completions/mean_terminated_length": 144.203125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.3777033201340236, "grad_norm": 1.6037989854812622, "kl": 3.95703125, "learning_rate": 1.2575852793097113e-05, "loss": 0.108, "num_tokens": 36460001.0, "reward": -2.068115234375, "reward_std": 1.9752014875411987, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.87060546875, "rewards/ppl_reward/std": 7.530940055847168, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 163.515625, "completions/mean_terminated_length": 163.515625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.3789217179409077, "grad_norm": 3.3735060691833496, "kl": 5.1162109375, "learning_rate": 1.2567625352692127e-05, "loss": 0.1462, "num_tokens": 36478258.0, "reward": -0.8575439453125, "reward_std": 0.3221926987171173, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.566650390625, "rewards/ppl_reward/std": 3.4802372455596924, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 130.546875, "completions/mean_terminated_length": 130.546875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.3801401157477917, "grad_norm": 2.0634303092956543, "kl": 5.08203125, "learning_rate": 1.255939605114886e-05, "loss": 0.1578, "num_tokens": 36493133.0, "reward": -1.5601806640625, "reward_std": 0.519132673740387, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.885986328125, "rewards/ppl_reward/std": 4.389716148376465, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 143.859375, "completions/mean_terminated_length": 143.859375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.3813585135546758, "grad_norm": 2.118499517440796, "kl": 4.39453125, "learning_rate": 1.2551164894432305e-05, "loss": 0.1456, "num_tokens": 36510932.0, "reward": -0.07177734375, "reward_std": 0.5341969728469849, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.8466796875, "rewards/ppl_reward/std": 1.7142316102981567, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 135.890625, "completions/mean_terminated_length": 135.890625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.3825769113615594, "grad_norm": 3.6678202152252197, "kl": 7.6953125, "learning_rate": 1.2542931888508804e-05, "loss": 0.314, "num_tokens": 36526397.0, "reward": -2.41162109375, "reward_std": 1.125171422958374, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.3466796875, "rewards/ppl_reward/std": 7.568170547485352, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 131.703125, "completions/mean_terminated_length": 131.703125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.3837953091684434, "grad_norm": 2.764477252960205, "kl": 4.693359375, "learning_rate": 1.253469703934603e-05, "loss": 0.1638, "num_tokens": 36541906.0, "reward": -1.1534423828125, "reward_std": 0.5397141575813293, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.955322265625, "rewards/ppl_reward/std": 2.442718505859375, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 125.6875, "completions/mean_terminated_length": 125.6875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.3850137069753274, "grad_norm": 1.5112859010696411, "kl": 3.109375, "learning_rate": 1.2526460352912994e-05, "loss": 0.0238, "num_tokens": 36556262.0, "reward": -0.7864990234375, "reward_std": 0.5676048994064331, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.322998046875, "rewards/ppl_reward/std": 2.8352558612823486, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.3862321047822115, "grad_norm": 1.8533762693405151, "kl": 4.17578125, "learning_rate": 1.2518221835180053e-05, "loss": 0.1564, "num_tokens": 36571186.0, "reward": -0.8524169921875, "reward_std": 1.087700605392456, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.345458984375, "rewards/ppl_reward/std": 4.3347039222717285, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 124.84375, "completions/mean_terminated_length": 124.84375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.3874505025890955, "grad_norm": 2.9936256408691406, "kl": 1.8212890625, "learning_rate": 1.2509981492118875e-05, "loss": 0.0173, "num_tokens": 36585488.0, "reward": -1.0941162109375, "reward_std": 0.6100156307220459, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.047607421875, "rewards/ppl_reward/std": 2.9101030826568604, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 131.140625, "completions/mean_terminated_length": 131.140625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.388668900395979, "grad_norm": 2.102086305618286, "kl": 5.3046875, "learning_rate": 1.2501739329702453e-05, "loss": 0.1677, "num_tokens": 36600865.0, "reward": -0.815185546875, "reward_std": 0.7164061069488525, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.24755859375, "rewards/ppl_reward/std": 1.8590013980865479, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.389887298202863, "grad_norm": 3.248950242996216, "kl": 5.0859375, "learning_rate": 1.2493495353905112e-05, "loss": 0.2306, "num_tokens": 36615281.0, "reward": -2.71905517578125, "reward_std": 0.632122278213501, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.1334228515625, "rewards/ppl_reward/std": 7.182764530181885, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 139.21875, "completions/mean_terminated_length": 139.21875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.391105696009747, "grad_norm": 2.5662472248077393, "kl": 7.0078125, "learning_rate": 1.2485249570702471e-05, "loss": 0.3364, "num_tokens": 36631447.0, "reward": -1.16064453125, "reward_std": 1.1694506406784058, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.9462890625, "rewards/ppl_reward/std": 3.4644484519958496, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2182178944349289, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 139.328125, "completions/mean_terminated_length": 139.328125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.3923240938166312, "grad_norm": 1.5430009365081787, "kl": 3.2109375, "learning_rate": 1.2477001986071478e-05, "loss": 0.1196, "num_tokens": 36647452.0, "reward": -1.712646484375, "reward_std": 0.45810064673423767, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.20654296875, "rewards/ppl_reward/std": 5.3546953201293945, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 128.640625, "completions/mean_terminated_length": 128.640625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.3935424916235153, "grad_norm": 1.9886666536331177, "kl": 5.03125, "learning_rate": 1.2468752605990378e-05, "loss": 0.1842, "num_tokens": 36662485.0, "reward": -0.00653076171875, "reward_std": 0.47456979751586914, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.7161865234375, "rewards/ppl_reward/std": 1.5059412717819214, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 121.328125, "completions/mean_terminated_length": 121.328125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.394760889430399, "grad_norm": 1.6653012037277222, "kl": 1.9951171875, "learning_rate": 1.2460501436438715e-05, "loss": -0.0238, "num_tokens": 36676754.0, "reward": -3.3748779296875, "reward_std": 0.5007771849632263, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -10.593505859375, "rewards/ppl_reward/std": 8.364954948425293, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 122.015625, "completions/mean_terminated_length": 122.015625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.395979287237283, "grad_norm": 2.288278102874756, "kl": 4.68359375, "learning_rate": 1.2452248483397331e-05, "loss": 0.1534, "num_tokens": 36691107.0, "reward": -2.400146484375, "reward_std": 0.8903687000274658, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.30810546875, "rewards/ppl_reward/std": 5.213621616363525, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 125.40625, "completions/mean_terminated_length": 125.40625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.397197685044167, "grad_norm": 2.133516311645508, "kl": 5.52734375, "learning_rate": 1.2443993752848371e-05, "loss": 0.217, "num_tokens": 36705941.0, "reward": -0.12347412109375, "reward_std": 0.5744995474815369, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.8875732421875, "rewards/ppl_reward/std": 1.5841810703277588, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 120.890625, "completions/mean_terminated_length": 120.890625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.398416082851051, "grad_norm": 2.759686231613159, "kl": 4.12890625, "learning_rate": 1.2435737250775258e-05, "loss": 0.1447, "num_tokens": 36720558.0, "reward": -0.9068603515625, "reward_std": 0.4193063974380493, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.579345703125, "rewards/ppl_reward/std": 2.9924302101135254, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 149.765625, "completions/mean_terminated_length": 149.765625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.3996344806579346, "grad_norm": 2.3978734016418457, "kl": 5.2421875, "learning_rate": 1.2427478983162694e-05, "loss": 0.2793, "num_tokens": 36738503.0, "reward": -0.5621337890625, "reward_std": 0.4483106732368469, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.874267578125, "rewards/ppl_reward/std": 2.8238532543182373, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 126.296875, "completions/mean_terminated_length": 126.296875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.4008528784648187, "grad_norm": 3.2227725982666016, "kl": 5.66796875, "learning_rate": 1.2419218955996677e-05, "loss": 0.236, "num_tokens": 36753314.0, "reward": -4.8873291015625, "reward_std": 1.1772749423980713, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -13.391845703125, "rewards/ppl_reward/std": 15.738158226013184, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 117.296875, "completions/mean_terminated_length": 117.296875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.4020712762717027, "grad_norm": 6.115329742431641, "kl": 9.16796875, "learning_rate": 1.241095717526447e-05, "loss": 0.3779, "num_tokens": 36767309.0, "reward": -1.89111328125, "reward_std": 1.2193140983581543, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -7.1181640625, "rewards/ppl_reward/std": 5.0267653465271, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2043897658586502, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 121.28125, "completions/mean_terminated_length": 121.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.4032896740785867, "grad_norm": 4.17263126373291, "kl": 8.875, "learning_rate": 1.2402693646954607e-05, "loss": 0.3646, "num_tokens": 36782647.0, "reward": -1.358154296875, "reward_std": 1.0436298847198486, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -6.09130859375, "rewards/ppl_reward/std": 4.823936939239502, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15430335700511932, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 140.890625, "completions/mean_terminated_length": 140.890625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.4045080718854708, "grad_norm": 2.2567663192749023, "kl": 6.6328125, "learning_rate": 1.2394428377056894e-05, "loss": 0.3384, "num_tokens": 36799088.0, "reward": -2.34228515625, "reward_std": 0.6497644186019897, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.3486328125, "rewards/ppl_reward/std": 4.6593523025512695, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 130.65625, "completions/mean_terminated_length": 130.65625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.4057264696923544, "grad_norm": 1.835274577140808, "kl": 4.421875, "learning_rate": 1.238616137156239e-05, "loss": 0.1789, "num_tokens": 36814658.0, "reward": -0.7310791015625, "reward_std": 0.5200255513191223, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.141845703125, "rewards/ppl_reward/std": 2.7701120376586914, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 139.40625, "completions/mean_terminated_length": 139.40625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.4069448674992384, "grad_norm": 2.0612258911132812, "kl": 7.306640625, "learning_rate": 1.2377892636463422e-05, "loss": 0.362, "num_tokens": 36831564.0, "reward": -0.3433837890625, "reward_std": 0.7334918975830078, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.272705078125, "rewards/ppl_reward/std": 2.050736427307129, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 114.328125, "completions/mean_terminated_length": 114.328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.4081632653061225, "grad_norm": 1.7436732053756714, "kl": 5.15234375, "learning_rate": 1.2369622177753567e-05, "loss": 0.2346, "num_tokens": 36845537.0, "reward": -1.0836181640625, "reward_std": 1.2506502866744995, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.596923828125, "rewards/ppl_reward/std": 4.108974456787109, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 131.203125, "completions/mean_terminated_length": 131.203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.4093816631130065, "grad_norm": 2.6473896503448486, "kl": 3.46875, "learning_rate": 1.236135000142765e-05, "loss": 0.1013, "num_tokens": 36861766.0, "reward": -0.9195556640625, "reward_std": 0.593382716178894, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.440673828125, "rewards/ppl_reward/std": 2.668269395828247, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 128.9375, "completions/mean_terminated_length": 128.9375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.41060006091989, "grad_norm": 2.1823954582214355, "kl": 6.0625, "learning_rate": 1.2353076113481742e-05, "loss": 0.278, "num_tokens": 36877250.0, "reward": -2.14404296875, "reward_std": 0.7919554114341736, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.7646484375, "rewards/ppl_reward/std": 6.374290466308594, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.25898414850234985, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 119.921875, "completions/mean_terminated_length": 119.921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.411818458726774, "grad_norm": 3.3921821117401123, "kl": 4.1171875, "learning_rate": 1.2344800519913152e-05, "loss": 0.2167, "num_tokens": 36891701.0, "reward": -1.9951171875, "reward_std": 0.9983804225921631, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.701171875, "rewards/ppl_reward/std": 5.38974142074585, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 116.296875, "completions/mean_terminated_length": 116.296875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.413036856533658, "grad_norm": 2.5577869415283203, "kl": 6.16015625, "learning_rate": 1.2336523226720434e-05, "loss": 0.298, "num_tokens": 36905840.0, "reward": -13.9967041015625, "reward_std": 21.715639114379883, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -31.485595703125, "rewards/ppl_reward/std": 130.30758666992188, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 115.515625, "completions/mean_terminated_length": 115.515625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.4142552543405422, "grad_norm": 1.7948983907699585, "kl": 4.625, "learning_rate": 1.2328244239903362e-05, "loss": 0.1433, "num_tokens": 36920521.0, "reward": -1.06097412109375, "reward_std": 0.7239501476287842, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.5906982421875, "rewards/ppl_reward/std": 4.897775173187256, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.4154736521474263, "grad_norm": 2.3637197017669678, "kl": 5.6640625, "learning_rate": 1.2319963565462949e-05, "loss": 0.3323, "num_tokens": 36935545.0, "reward": -3.1201171875, "reward_std": 1.9857115745544434, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.662109375, "rewards/ppl_reward/std": 13.609549522399902, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2735668122768402, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.4166920499543103, "grad_norm": 1.8253952264785767, "kl": 3.7109375, "learning_rate": 1.2311681209401423e-05, "loss": 0.0758, "num_tokens": 36949601.0, "reward": -0.4937744140625, "reward_std": 0.6816595792770386, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.581298828125, "rewards/ppl_reward/std": 1.4351775646209717, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19416078925132751, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 112.4375, "completions/mean_terminated_length": 112.4375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.417910447761194, "grad_norm": 2.198939085006714, "kl": 2.8125, "learning_rate": 1.2303397177722234e-05, "loss": 0.0787, "num_tokens": 36964149.0, "reward": -2.4578857421875, "reward_std": 0.5536757707595825, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.665771484375, "rewards/ppl_reward/std": 7.344307899475098, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 120.078125, "completions/mean_terminated_length": 120.078125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.419128845568078, "grad_norm": 2.0945992469787598, "kl": 3.5078125, "learning_rate": 1.2295111476430044e-05, "loss": 0.0903, "num_tokens": 36979170.0, "reward": -0.9150390625, "reward_std": 0.5286633968353271, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.548828125, "rewards/ppl_reward/std": 2.4235033988952637, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 110.0625, "completions/mean_terminated_length": 110.0625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.420347243374962, "grad_norm": 1.4919688701629639, "kl": 3.02734375, "learning_rate": 1.2286824111530734e-05, "loss": 0.0805, "num_tokens": 36992966.0, "reward": -0.5587158203125, "reward_std": 0.5308783054351807, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.937744140625, "rewards/ppl_reward/std": 2.6436758041381836, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 122.265625, "completions/mean_terminated_length": 122.265625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.421565641181846, "grad_norm": 2.578092098236084, "kl": 6.67578125, "learning_rate": 1.2278535089031377e-05, "loss": 0.2112, "num_tokens": 37008559.0, "reward": -0.62451171875, "reward_std": 0.7176742553710938, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.9130859375, "rewards/ppl_reward/std": 2.3095510005950928, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 121.4375, "completions/mean_terminated_length": 121.4375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.4227840389887296, "grad_norm": 2.2727017402648926, "kl": 3.4775390625, "learning_rate": 1.227024441494026e-05, "loss": 0.1072, "num_tokens": 37023483.0, "reward": -1.1865234375, "reward_std": 0.6718852519989014, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.099609375, "rewards/ppl_reward/std": 4.303016185760498, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 114.109375, "completions/mean_terminated_length": 114.109375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.4240024367956137, "grad_norm": 1.7412985563278198, "kl": 4.3154296875, "learning_rate": 1.226195209526686e-05, "loss": 0.1026, "num_tokens": 37037330.0, "reward": -1.916259765625, "reward_std": 0.8700181245803833, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.51220703125, "rewards/ppl_reward/std": 5.1208062171936035, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 124.12699127197266, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.4252208346024977, "grad_norm": 1.8890472650527954, "kl": 5.5888671875, "learning_rate": 1.2253658136021852e-05, "loss": 0.3639, "num_tokens": 37053486.0, "reward": -0.80572509765625, "reward_std": 0.30141085386276245, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.5255126953125, "rewards/ppl_reward/std": 3.517526626586914, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 124.4375, "completions/mean_terminated_length": 124.4375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.4264392324093818, "grad_norm": 1.9413503408432007, "kl": 6.31640625, "learning_rate": 1.224536254321709e-05, "loss": 0.33, "num_tokens": 37068658.0, "reward": -0.8873291015625, "reward_std": 0.5468596816062927, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.469970703125, "rewards/ppl_reward/std": 4.135261535644531, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 124.203125, "completions/mean_terminated_length": 124.203125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.427657630216266, "grad_norm": 2.732912063598633, "kl": 7.28125, "learning_rate": 1.2237065322865625e-05, "loss": 0.2591, "num_tokens": 37083519.0, "reward": -1.904052734375, "reward_std": 1.7616368532180786, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.34716796875, "rewards/ppl_reward/std": 6.4124884605407715, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 130.9375, "completions/mean_terminated_length": 130.9375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.4288760280231494, "grad_norm": 4.196553707122803, "kl": 8.6201171875, "learning_rate": 1.2228766480981678e-05, "loss": 0.3969, "num_tokens": 37098219.0, "reward": -5.6112060546875, "reward_std": 0.9984546303749084, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -14.902099609375, "rewards/ppl_reward/std": 11.194435119628906, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 130.4375, "completions/mean_terminated_length": 130.4375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.4300944258300334, "grad_norm": 1.5500130653381348, "kl": 5.00390625, "learning_rate": 1.2220466023580646e-05, "loss": 0.1856, "num_tokens": 37113511.0, "reward": -0.919677734375, "reward_std": 0.8475956916809082, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.50341796875, "rewards/ppl_reward/std": 3.038969039916992, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 135.265625, "completions/mean_terminated_length": 135.265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.4313128236369175, "grad_norm": 1.875632882118225, "kl": 4.15625, "learning_rate": 1.2212163956679106e-05, "loss": 0.0568, "num_tokens": 37129416.0, "reward": -0.20880126953125, "reward_std": 0.9019309282302856, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -3.8941650390625, "rewards/ppl_reward/std": 2.261827230453491, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 128.90625, "completions/mean_terminated_length": 128.90625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.4325312214438015, "grad_norm": 2.4760172367095947, "kl": 5.234375, "learning_rate": 1.220386028629479e-05, "loss": 0.1859, "num_tokens": 37144930.0, "reward": -0.9598388671875, "reward_std": 0.7726154327392578, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.411865234375, "rewards/ppl_reward/std": 2.317204713821411, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17390352487564087, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.433749619250685, "grad_norm": 3.3435375690460205, "kl": 6.1796875, "learning_rate": 1.2195555018446598e-05, "loss": 0.162, "num_tokens": 37159914.0, "reward": -2.8157958984375, "reward_std": 0.7712975740432739, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.225341796875, "rewards/ppl_reward/std": 7.006827354431152, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.434968017057569, "grad_norm": 2.603928327560425, "kl": 6.40625, "learning_rate": 1.218724815915459e-05, "loss": 0.2251, "num_tokens": 37175894.0, "reward": -0.801025390625, "reward_std": 1.052117109298706, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -4.93017578125, "rewards/ppl_reward/std": 2.946763753890991, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19527530670166016, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 134.390625, "completions/mean_terminated_length": 134.390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.436186414864453, "grad_norm": 3.6861274242401123, "kl": 8.2734375, "learning_rate": 1.217893971443998e-05, "loss": 0.2687, "num_tokens": 37191583.0, "reward": -0.8526611328125, "reward_std": 1.1631555557250977, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -4.986572265625, "rewards/ppl_reward/std": 2.7175679206848145, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 174.671875, "completions/mean_terminated_length": 174.671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.4374048126713372, "grad_norm": 1.765749216079712, "kl": 5.15234375, "learning_rate": 1.2170629690325117e-05, "loss": 0.2046, "num_tokens": 37210122.0, "reward": -3.07501220703125, "reward_std": 2.204066514968872, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.7125244140625, "rewards/ppl_reward/std": 14.801548957824707, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 146.65625, "completions/mean_terminated_length": 146.65625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.4386232104782213, "grad_norm": 3.2237977981567383, "kl": 8.1484375, "learning_rate": 1.2162318092833513e-05, "loss": 0.2717, "num_tokens": 37226660.0, "reward": -1.9986572265625, "reward_std": 1.2120416164398193, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -7.255126953125, "rewards/ppl_reward/std": 4.640090465545654, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.21271435916423798, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 138.890625, "completions/mean_terminated_length": 138.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.4398416082851053, "grad_norm": 3.0719704627990723, "kl": 5.7421875, "learning_rate": 1.2154004927989815e-05, "loss": 0.2546, "num_tokens": 37242717.0, "reward": -0.26031494140625, "reward_std": 0.5936108827590942, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.0753173828125, "rewards/ppl_reward/std": 1.643215537071228, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.441060006091989, "grad_norm": 1.906171202659607, "kl": 5.1796875, "learning_rate": 1.21456902018198e-05, "loss": 0.222, "num_tokens": 37257805.0, "reward": -1.6854248046875, "reward_std": 0.8164541125297546, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.769287109375, "rewards/ppl_reward/std": 4.3219194412231445, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 150.46875, "completions/mean_terminated_length": 150.46875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.442278403898873, "grad_norm": 2.1267848014831543, "kl": 4.66015625, "learning_rate": 1.2137373920350386e-05, "loss": 0.2043, "num_tokens": 37274803.0, "reward": -0.01104736328125, "reward_std": 0.4945654273033142, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -3.5767822265625, "rewards/ppl_reward/std": 1.1726012229919434, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 131.515625, "completions/mean_terminated_length": 131.515625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.443496801705757, "grad_norm": 1.7281712293624878, "kl": 4.3515625, "learning_rate": 1.212905608960961e-05, "loss": 0.1437, "num_tokens": 37289756.0, "reward": -1.9544677734375, "reward_std": 1.5613563060760498, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.268310546875, "rewards/ppl_reward/std": 4.541396617889404, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 131.171875, "completions/mean_terminated_length": 131.171875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.444715199512641, "grad_norm": 2.249476671218872, "kl": 3.046875, "learning_rate": 1.2120736715626637e-05, "loss": 0.0273, "num_tokens": 37304511.0, "reward": -1.2943115234375, "reward_std": 0.9653441905975342, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.197998046875, "rewards/ppl_reward/std": 4.8287153244018555, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 131.984375, "completions/mean_terminated_length": 131.984375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.4459335973195246, "grad_norm": 2.2529690265655518, "kl": 4.8125, "learning_rate": 1.2112415804431748e-05, "loss": 0.1811, "num_tokens": 37319638.0, "reward": -1.072021484375, "reward_std": 0.941142201423645, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.49560546875, "rewards/ppl_reward/std": 3.0685648918151855, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.224347323179245, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 145.328125, "completions/mean_terminated_length": 145.328125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.4471519951264087, "grad_norm": 2.107465982437134, "kl": 1.76171875, "learning_rate": 1.2104093362056341e-05, "loss": 0.0105, "num_tokens": 37335451.0, "reward": -0.9754638671875, "reward_std": 0.30879098176956177, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.802490234375, "rewards/ppl_reward/std": 2.4934134483337402, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 132.734375, "completions/mean_terminated_length": 132.734375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.4483703929332927, "grad_norm": 2.20664119720459, "kl": 3.072265625, "learning_rate": 1.2095769394532924e-05, "loss": 0.0829, "num_tokens": 37350682.0, "reward": -1.2620849609375, "reward_std": 0.6763539910316467, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.219482421875, "rewards/ppl_reward/std": 2.680122137069702, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 141.796875, "completions/mean_terminated_length": 141.796875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.4495887907401768, "grad_norm": 2.936558961868286, "kl": 3.21484375, "learning_rate": 1.2087443907895102e-05, "loss": 0.0753, "num_tokens": 37366989.0, "reward": -1.22265625, "reward_std": 0.6716411113739014, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.140625, "rewards/ppl_reward/std": 4.001919746398926, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 140.21875, "completions/mean_terminated_length": 140.21875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.450807188547061, "grad_norm": 1.7887388467788696, "kl": 2.87109375, "learning_rate": 1.2079116908177592e-05, "loss": 0.0587, "num_tokens": 37383131.0, "reward": -2.679931640625, "reward_std": 0.6234809756278992, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.13330078125, "rewards/ppl_reward/std": 12.978931427001953, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 145.71875, "completions/mean_terminated_length": 145.71875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.4520255863539444, "grad_norm": 1.7772456407546997, "kl": 2.890625, "learning_rate": 1.2070788401416209e-05, "loss": 0.0885, "num_tokens": 37399697.0, "reward": -3.40625, "reward_std": 0.41324344277381897, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -10.625, "rewards/ppl_reward/std": 11.527924537658691, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 139.765625, "completions/mean_terminated_length": 139.765625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.4532439841608285, "grad_norm": 1.4166362285614014, "kl": 1.98828125, "learning_rate": 1.206245839364785e-05, "loss": 0.0141, "num_tokens": 37416042.0, "reward": -0.1456298828125, "reward_std": 0.3952408730983734, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.135009765625, "rewards/ppl_reward/std": 1.5867327451705933, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 137.15625, "completions/mean_terminated_length": 137.15625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.4544623819677125, "grad_norm": 2.3789498805999756, "kl": 3.9541015625, "learning_rate": 1.2054126890910499e-05, "loss": 0.1376, "num_tokens": 37431924.0, "reward": -3.595947265625, "reward_std": 0.7153322696685791, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -10.97314453125, "rewards/ppl_reward/std": 15.132291793823242, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 137.8125, "completions/mean_terminated_length": 137.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.4556807797745965, "grad_norm": 2.4377074241638184, "kl": 5.88671875, "learning_rate": 1.2045793899243238e-05, "loss": 0.2521, "num_tokens": 37447664.0, "reward": -2.62042236328125, "reward_std": 1.6246684789657593, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.8424072265625, "rewards/ppl_reward/std": 11.00524616241455, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 128.828125, "completions/mean_terminated_length": 128.828125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.45689917758148, "grad_norm": 2.841428756713867, "kl": 7.1171875, "learning_rate": 1.203745942468622e-05, "loss": 0.2357, "num_tokens": 37462749.0, "reward": -2.2137451171875, "reward_std": 1.4347466230392456, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.857177734375, "rewards/ppl_reward/std": 9.121424674987793, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.24241341650485992, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 163.09375, "completions/mean_terminated_length": 163.09375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.458117575388364, "grad_norm": 4.7718634605407715, "kl": 11.2734375, "learning_rate": 1.2029123473280668e-05, "loss": 0.5286, "num_tokens": 37480867.0, "reward": -7.1151123046875, "reward_std": 11.294007301330566, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -17.753662109375, "rewards/ppl_reward/std": 64.82366943359375, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.459335973195248, "grad_norm": 1.9487491846084595, "kl": 5.177734375, "learning_rate": 1.2020786051068887e-05, "loss": 0.0988, "num_tokens": 37495887.0, "reward": -0.8780517578125, "reward_std": 1.015960931777954, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.349853515625, "rewards/ppl_reward/std": 2.5656399726867676, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 145.9375, "completions/mean_terminated_length": 145.9375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.4605543710021323, "grad_norm": 2.2857885360717773, "kl": 2.6181640625, "learning_rate": 1.2012447164094232e-05, "loss": -0.0161, "num_tokens": 37512747.0, "reward": -2.35650634765625, "reward_std": 0.7729240655899048, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.4473876953125, "rewards/ppl_reward/std": 6.561099052429199, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 130.453125, "completions/mean_terminated_length": 130.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.4617727688090163, "grad_norm": 2.372260808944702, "kl": 3.6875, "learning_rate": 1.2004106818401135e-05, "loss": 0.093, "num_tokens": 37528152.0, "reward": -0.5950927734375, "reward_std": 0.41000646352767944, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.010498046875, "rewards/ppl_reward/std": 1.6975950002670288, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.4629911666159003, "grad_norm": 2.8147964477539062, "kl": 7.6640625, "learning_rate": 1.1995765020035081e-05, "loss": 0.3214, "num_tokens": 37545598.0, "reward": -0.915771484375, "reward_std": 0.9907498955726624, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.44873046875, "rewards/ppl_reward/std": 2.4354093074798584, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 138.015625, "completions/mean_terminated_length": 138.015625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.464209564422784, "grad_norm": 1.670095682144165, "kl": 4.8818359375, "learning_rate": 1.1987421775042605e-05, "loss": 0.184, "num_tokens": 37561367.0, "reward": -0.8509521484375, "reward_std": 0.6764846444129944, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.420654296875, "rewards/ppl_reward/std": 3.031831979751587, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 154.703125, "completions/mean_terminated_length": 154.703125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.465427962229668, "grad_norm": 2.8096535205841064, "kl": 5.845703125, "learning_rate": 1.1979077089471288e-05, "loss": 0.2155, "num_tokens": 37579196.0, "reward": -0.72515869140625, "reward_std": 0.6671391725540161, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.1143798828125, "rewards/ppl_reward/std": 2.327827215194702, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 155.078125, "completions/mean_terminated_length": 155.078125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.466646360036552, "grad_norm": 2.977537155151367, "kl": 4.2412109375, "learning_rate": 1.1970730969369764e-05, "loss": 0.1099, "num_tokens": 37597385.0, "reward": -1.5838623046875, "reward_std": 0.535077691078186, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.894287109375, "rewards/ppl_reward/std": 3.562870979309082, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 138.78125, "completions/mean_terminated_length": 138.78125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.467864757843436, "grad_norm": 3.309960126876831, "kl": 7.421875, "learning_rate": 1.19623834207877e-05, "loss": 0.2045, "num_tokens": 37613067.0, "reward": -1.6021728515625, "reward_std": 1.0365407466888428, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.743408203125, "rewards/ppl_reward/std": 5.180017948150635, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 129.546875, "completions/mean_terminated_length": 129.546875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.4690831556503197, "grad_norm": 1.389154076576233, "kl": 3.296875, "learning_rate": 1.1954034449775802e-05, "loss": 0.1073, "num_tokens": 37627646.0, "reward": -0.732421875, "reward_std": 1.0647003650665283, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.21484375, "rewards/ppl_reward/std": 2.946725606918335, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 127.421875, "completions/mean_terminated_length": 127.421875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.4703015534572037, "grad_norm": 2.9331908226013184, "kl": 5.890625, "learning_rate": 1.1945684062385802e-05, "loss": 0.1978, "num_tokens": 37642073.0, "reward": -5.125, "reward_std": 1.5141174793243408, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -13.671875, "rewards/ppl_reward/std": 22.018672943115234, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.250866562128067, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 130.46875, "completions/mean_terminated_length": 130.46875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.4715199512640877, "grad_norm": 2.7549941539764404, "kl": 7.62890625, "learning_rate": 1.193733226467047e-05, "loss": 0.345, "num_tokens": 37656991.0, "reward": -2.513671875, "reward_std": 2.167558431625366, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.50390625, "rewards/ppl_reward/std": 10.800296783447266, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2372427135705948, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 132.015625, "completions/mean_terminated_length": 132.015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.472738349070972, "grad_norm": 1.5245577096939087, "kl": 2.369140625, "learning_rate": 1.1928979062683579e-05, "loss": -0.0425, "num_tokens": 37671960.0, "reward": -3.77392578125, "reward_std": 4.280478000640869, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -10.9775390625, "rewards/ppl_reward/std": 23.692903518676758, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.25439465045928955, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 127.640625, "completions/mean_terminated_length": 127.640625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.473956746877856, "grad_norm": 1.6515864133834839, "kl": 1.732421875, "learning_rate": 1.1920624462479941e-05, "loss": -0.0037, "num_tokens": 37686705.0, "reward": -1.8101806640625, "reward_std": 0.7257195711135864, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.471923828125, "rewards/ppl_reward/std": 5.761387348175049, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 152.65625, "completions/mean_terminated_length": 152.65625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.4751751446847394, "grad_norm": 2.76177978515625, "kl": 6.796875, "learning_rate": 1.191226847011537e-05, "loss": 0.2356, "num_tokens": 37703699.0, "reward": -0.9766845703125, "reward_std": 1.0628801584243774, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.320556640625, "rewards/ppl_reward/std": 1.8834774494171143, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22100594639778137, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 133.53125, "completions/mean_terminated_length": 133.53125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.4763935424916235, "grad_norm": 12.233492851257324, "kl": 3.2109375, "learning_rate": 1.1903911091646684e-05, "loss": 0.041, "num_tokens": 37718861.0, "reward": -0.6527099609375, "reward_std": 0.7991209030151367, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.110107421875, "rewards/ppl_reward/std": 3.8274600505828857, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 142.21875, "completions/mean_terminated_length": 142.21875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.4776119402985075, "grad_norm": 2.073573350906372, "kl": 5.0078125, "learning_rate": 1.1895552333131721e-05, "loss": 0.1617, "num_tokens": 37734955.0, "reward": -12.43902587890625, "reward_std": 8.742303848266602, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -28.3233642578125, "rewards/ppl_reward/std": 75.83422088623047, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.4788303381053916, "grad_norm": 2.3957784175872803, "kl": 3.125, "learning_rate": 1.188719220062931e-05, "loss": 0.1735, "num_tokens": 37752243.0, "reward": -0.93701171875, "reward_std": 0.7684550881385803, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.6005859375, "rewards/ppl_reward/std": 2.1889467239379883, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 133.6875, "completions/mean_terminated_length": 133.6875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.480048735912275, "grad_norm": 1.7107899188995361, "kl": 2.822265625, "learning_rate": 1.187883070019927e-05, "loss": 0.0792, "num_tokens": 37767775.0, "reward": -1.521484375, "reward_std": 0.8689013719558716, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.78515625, "rewards/ppl_reward/std": 4.722762584686279, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 166.453125, "completions/mean_terminated_length": 166.453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.481267133719159, "grad_norm": 1.8903708457946777, "kl": 3.8125, "learning_rate": 1.1870467837902426e-05, "loss": 0.1262, "num_tokens": 37787236.0, "reward": -0.75372314453125, "reward_std": 0.4018515348434448, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.2183837890625, "rewards/ppl_reward/std": 2.1864051818847656, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 148.96875, "completions/mean_terminated_length": 148.96875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.4824855315260432, "grad_norm": 2.660940647125244, "kl": 4.11328125, "learning_rate": 1.186210361980058e-05, "loss": 0.1818, "num_tokens": 37804458.0, "reward": -0.684814453125, "reward_std": 0.4971808195114136, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.99462890625, "rewards/ppl_reward/std": 2.932785749435425, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 139.140625, "completions/mean_terminated_length": 139.140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.4837039293329273, "grad_norm": 2.176255702972412, "kl": 3.53515625, "learning_rate": 1.1853738051956518e-05, "loss": 0.1745, "num_tokens": 37820123.0, "reward": -0.5396728515625, "reward_std": 0.8501957654953003, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.852783203125, "rewards/ppl_reward/std": 3.3126447200775146, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 137.90625, "completions/mean_terminated_length": 137.90625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.4849223271398113, "grad_norm": 1.957270860671997, "kl": 3.71484375, "learning_rate": 1.1845371140434008e-05, "loss": 0.1929, "num_tokens": 37835613.0, "reward": -1.2366943359375, "reward_std": 0.4817035496234894, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.301513671875, "rewards/ppl_reward/std": 3.9478487968444824, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 144.796875, "completions/mean_terminated_length": 144.796875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.486140724946695, "grad_norm": 2.6631720066070557, "kl": 5.767578125, "learning_rate": 1.183700289129779e-05, "loss": 0.233, "num_tokens": 37851984.0, "reward": -1.864501953125, "reward_std": 0.6557574272155762, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.43994140625, "rewards/ppl_reward/std": 3.1955714225769043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 139.109375, "completions/mean_terminated_length": 139.109375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.487359122753579, "grad_norm": 1.6728054285049438, "kl": 2.333984375, "learning_rate": 1.1828633310613569e-05, "loss": 0.0242, "num_tokens": 37868463.0, "reward": -1.15020751953125, "reward_std": 0.38549935817718506, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.0894775390625, "rewards/ppl_reward/std": 4.19455623626709, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 142.484375, "completions/mean_terminated_length": 142.484375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.488577520560463, "grad_norm": 1.3523298501968384, "kl": 2.8203125, "learning_rate": 1.1820262404448023e-05, "loss": 0.0872, "num_tokens": 37884990.0, "reward": -0.9520263671875, "reward_std": 0.5159232020378113, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.607177734375, "rewards/ppl_reward/std": 3.2222886085510254, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 129.984375, "completions/mean_terminated_length": 129.984375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.489795918367347, "grad_norm": 1.4997049570083618, "kl": 2.298828125, "learning_rate": 1.1811890178868785e-05, "loss": 0.038, "num_tokens": 37899845.0, "reward": -1.45556640625, "reward_std": 0.6734703183174133, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.7392578125, "rewards/ppl_reward/std": 4.264693260192871, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 133.46875, "completions/mean_terminated_length": 133.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.491014316174231, "grad_norm": 5.6199212074279785, "kl": 5.69140625, "learning_rate": 1.1803516639944452e-05, "loss": 0.1791, "num_tokens": 37915579.0, "reward": -2.37353515625, "reward_std": 1.8465094566345215, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.2470703125, "rewards/ppl_reward/std": 9.106461524963379, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.23779743909835815, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 128.359375, "completions/mean_terminated_length": 128.359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.4922327139811147, "grad_norm": 2.8410701751708984, "kl": 4.45703125, "learning_rate": 1.179514179374456e-05, "loss": 0.1459, "num_tokens": 37930882.0, "reward": -2.9490966796875, "reward_std": 0.6440157294273376, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.773193359375, "rewards/ppl_reward/std": 7.596728324890137, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 130.078125, "completions/mean_terminated_length": 130.078125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.4934511117879987, "grad_norm": 2.5944864749908447, "kl": 6.5068359375, "learning_rate": 1.1786765646339601e-05, "loss": 0.2985, "num_tokens": 37945511.0, "reward": -0.6898193359375, "reward_std": 1.0093884468078613, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.028076171875, "rewards/ppl_reward/std": 2.9537622928619385, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 113.296875, "completions/mean_terminated_length": 113.296875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.4946695095948828, "grad_norm": 3.8521041870117188, "kl": 8.4375, "learning_rate": 1.1778388203801019e-05, "loss": 0.2821, "num_tokens": 37958602.0, "reward": -1.91552734375, "reward_std": 1.8215707540512085, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.3623046875, "rewards/ppl_reward/std": 7.592586040496826, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.125, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 128.78125, "completions/mean_terminated_length": 128.78125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.495887907401767, "grad_norm": 4.088205337524414, "kl": 9.76953125, "learning_rate": 1.1770009472201177e-05, "loss": 0.5174, "num_tokens": 37973332.0, "reward": -3.41693115234375, "reward_std": 1.1538488864898682, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.5369873046875, "rewards/ppl_reward/std": 11.978854179382324, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 136.296875, "completions/mean_terminated_length": 136.296875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.497106305208651, "grad_norm": 4.434811115264893, "kl": 11.15625, "learning_rate": 1.1761629457613382e-05, "loss": 0.4962, "num_tokens": 37989303.0, "reward": -5.4254150390625, "reward_std": 2.5825297832489014, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -14.327392578125, "rewards/ppl_reward/std": 19.885284423828125, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 136.109375, "completions/mean_terminated_length": 136.109375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 2.4983247030155344, "grad_norm": 1.5600820779800415, "kl": 3.9921875, "learning_rate": 1.175324816611188e-05, "loss": 0.1112, "num_tokens": 38004350.0, "reward": -1.407470703125, "reward_std": 0.45304325222969055, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.58056640625, "rewards/ppl_reward/std": 3.1921777725219727, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 137.203125, "completions/mean_terminated_length": 137.203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.4995431008224185, "grad_norm": 2.56284499168396, "kl": 8.81640625, "learning_rate": 1.1744865603771825e-05, "loss": 0.48, "num_tokens": 38019955.0, "reward": -2.050048828125, "reward_std": 1.0508556365966797, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.70166015625, "rewards/ppl_reward/std": 3.9721572399139404, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 130.296875, "completions/mean_terminated_length": 130.296875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.5007614986293025, "grad_norm": 1.5828948020935059, "kl": 4.0732421875, "learning_rate": 1.1736481776669307e-05, "loss": 0.1553, "num_tokens": 38035126.0, "reward": -1.4254150390625, "reward_std": 0.6528881192207336, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.632080078125, "rewards/ppl_reward/std": 3.1542465686798096, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 131.796875, "completions/mean_terminated_length": 131.796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.5019798964361866, "grad_norm": 2.0425453186035156, "kl": 4.6171875, "learning_rate": 1.1728096690881323e-05, "loss": 0.1665, "num_tokens": 38050465.0, "reward": -0.6741943359375, "reward_std": 0.7235932946205139, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.981201171875, "rewards/ppl_reward/std": 2.813138246536255, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 129.078125, "completions/mean_terminated_length": 129.078125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.50319829424307, "grad_norm": 1.823939323425293, "kl": 4.1845703125, "learning_rate": 1.1719710352485786e-05, "loss": 0.143, "num_tokens": 38065366.0, "reward": -1.166015625, "reward_std": 1.2876007556915283, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.10546875, "rewards/ppl_reward/std": 6.773080825805664, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 146.3125, "completions/mean_terminated_length": 146.3125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.504416692049954, "grad_norm": 2.658968687057495, "kl": 8.109375, "learning_rate": 1.1711322767561522e-05, "loss": 0.3688, "num_tokens": 38081546.0, "reward": -0.4149169921875, "reward_std": 0.6905431747436523, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.329833984375, "rewards/ppl_reward/std": 2.8494162559509277, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 135.015625, "completions/mean_terminated_length": 135.015625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 2.5056350898568382, "grad_norm": 1.8582347631454468, "kl": 4.0078125, "learning_rate": 1.1702933942188252e-05, "loss": 0.1192, "num_tokens": 38097163.0, "reward": -0.35198974609375, "reward_std": 0.597427487373352, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.4774169921875, "rewards/ppl_reward/std": 2.8586790561676025, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.5068534876637223, "grad_norm": 2.1871674060821533, "kl": 5.86328125, "learning_rate": 1.1694543882446603e-05, "loss": 0.1572, "num_tokens": 38113371.0, "reward": -1.1072998046875, "reward_std": 1.0843020677566528, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.628662109375, "rewards/ppl_reward/std": 3.467911720275879, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2121305763721466, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.5080718854706063, "grad_norm": 2.3824684619903564, "kl": 3.41015625, "learning_rate": 1.168615259441809e-05, "loss": 0.1344, "num_tokens": 38129835.0, "reward": -0.43212890625, "reward_std": 0.9731181859970093, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.5048828125, "rewards/ppl_reward/std": 4.12286901473999, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2376670390367508, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 129.15625, "completions/mean_terminated_length": 129.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.5092902832774904, "grad_norm": 3.9977071285247803, "kl": 3.73828125, "learning_rate": 1.1677760084185123e-05, "loss": 0.0968, "num_tokens": 38145197.0, "reward": -0.8048095703125, "reward_std": 0.5864834785461426, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.281494140625, "rewards/ppl_reward/std": 2.924968957901001, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 133.046875, "completions/mean_terminated_length": 133.046875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.510508681084374, "grad_norm": 1.7894320487976074, "kl": 3.3115234375, "learning_rate": 1.1669366357830996e-05, "loss": 0.0672, "num_tokens": 38160384.0, "reward": -0.75, "reward_std": 0.5476000905036926, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.28125, "rewards/ppl_reward/std": 3.4491629600524902, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 136.53125, "completions/mean_terminated_length": 136.53125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.511727078891258, "grad_norm": 3.9250106811523438, "kl": 4.73828125, "learning_rate": 1.1660971421439889e-05, "loss": 0.2685, "num_tokens": 38175914.0, "reward": -1.38037109375, "reward_std": 0.6153452396392822, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.4013671875, "rewards/ppl_reward/std": 3.952709674835205, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 141.21875, "completions/mean_terminated_length": 141.21875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.512945476698142, "grad_norm": 2.3536465167999268, "kl": 4.298828125, "learning_rate": 1.165257528109685e-05, "loss": 0.123, "num_tokens": 38192360.0, "reward": -2.66162109375, "reward_std": 1.3909828662872314, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.9248046875, "rewards/ppl_reward/std": 5.756845474243164, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 138.796875, "completions/mean_terminated_length": 138.796875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.5141638745050257, "grad_norm": 1.832557201385498, "kl": 3.6923828125, "learning_rate": 1.1644177942887812e-05, "loss": 0.1664, "num_tokens": 38208275.0, "reward": -0.7838134765625, "reward_std": 0.3460676968097687, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.333251953125, "rewards/ppl_reward/std": 2.173292398452759, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 135.390625, "completions/mean_terminated_length": 135.390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.5153822723119097, "grad_norm": 2.250452995300293, "kl": 2.919921875, "learning_rate": 1.1635779412899562e-05, "loss": 0.1486, "num_tokens": 38223964.0, "reward": -1.2554931640625, "reward_std": 0.5287381410598755, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.300048828125, "rewards/ppl_reward/std": 3.2587597370147705, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 151.265625, "completions/mean_terminated_length": 151.265625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.5166006701187937, "grad_norm": 2.6513638496398926, "kl": 5.12109375, "learning_rate": 1.1627379697219762e-05, "loss": 0.2881, "num_tokens": 38241397.0, "reward": -1.279541015625, "reward_std": 0.7399271130561829, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.23876953125, "rewards/ppl_reward/std": 3.5083887577056885, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 141.71875, "completions/mean_terminated_length": 141.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.517819067925678, "grad_norm": 4.214537143707275, "kl": 6.17578125, "learning_rate": 1.1618978801936933e-05, "loss": 0.394, "num_tokens": 38257979.0, "reward": -0.66680908203125, "reward_std": 0.6804684996604919, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.0211181640625, "rewards/ppl_reward/std": 3.125487804412842, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 127.03125, "completions/mean_terminated_length": 127.03125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.519037465732562, "grad_norm": 2.6901113986968994, "kl": 5.76171875, "learning_rate": 1.161057673314044e-05, "loss": 0.2178, "num_tokens": 38272413.0, "reward": -0.9862060546875, "reward_std": 1.0263557434082031, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.527099609375, "rewards/ppl_reward/std": 3.544422149658203, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 133.28125, "completions/mean_terminated_length": 133.28125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.520255863539446, "grad_norm": 3.2787349224090576, "kl": 7.421875, "learning_rate": 1.160217349692051e-05, "loss": 0.3338, "num_tokens": 38288063.0, "reward": -2.1553955078125, "reward_std": 0.8914797902107239, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.842041015625, "rewards/ppl_reward/std": 3.5105671882629395, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2221602201461792, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 138.046875, "completions/mean_terminated_length": 138.046875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.5214742613463295, "grad_norm": 3.5732715129852295, "kl": 8.552734375, "learning_rate": 1.1593769099368218e-05, "loss": 0.3549, "num_tokens": 38304218.0, "reward": -0.826904296875, "reward_std": 1.0060168504714966, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -4.98974609375, "rewards/ppl_reward/std": 3.039914608001709, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.25946253538131714, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 126.1875, "completions/mean_terminated_length": 126.1875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.5226926591532135, "grad_norm": 3.8750076293945312, "kl": 9.015625, "learning_rate": 1.1585363546575468e-05, "loss": 0.337, "num_tokens": 38318910.0, "reward": -2.185791015625, "reward_std": 1.5648473501205444, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.84814453125, "rewards/ppl_reward/std": 5.605151653289795, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 120.609375, "completions/mean_terminated_length": 120.609375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.5239110569600975, "grad_norm": 2.850207567214966, "kl": 6.294921875, "learning_rate": 1.157695684463501e-05, "loss": 0.2507, "num_tokens": 38333285.0, "reward": -1.2169189453125, "reward_std": 0.7916715145111084, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.066650390625, "rewards/ppl_reward/std": 3.5784549713134766, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1302827149629593, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 134.265625, "completions/mean_terminated_length": 134.265625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.5251294547669816, "grad_norm": 2.507615327835083, "kl": 4.91796875, "learning_rate": 1.1568548999640428e-05, "loss": 0.2166, "num_tokens": 38348830.0, "reward": -1.5120849609375, "reward_std": 0.46108371019363403, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.735107421875, "rewards/ppl_reward/std": 5.073060989379883, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 144.328125, "completions/mean_terminated_length": 144.328125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.526347852573865, "grad_norm": 1.775431752204895, "kl": 7.9609375, "learning_rate": 1.156014001768613e-05, "loss": 0.3979, "num_tokens": 38365899.0, "reward": -1.8880615234375, "reward_std": 1.2932419776916504, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.096435546875, "rewards/ppl_reward/std": 6.494823455810547, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.24722543358802795, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 132.015625, "completions/mean_terminated_length": 132.015625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.5275662503807492, "grad_norm": 1.8791038990020752, "kl": 4.240234375, "learning_rate": 1.1551729904867353e-05, "loss": 0.1339, "num_tokens": 38381252.0, "reward": -0.71990966796875, "reward_std": 0.6544289588928223, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.1273193359375, "rewards/ppl_reward/std": 2.3349382877349854, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.5287846481876333, "grad_norm": 1.59870183467865, "kl": 7.8359375, "learning_rate": 1.1543318667280148e-05, "loss": 0.392, "num_tokens": 38396216.0, "reward": -2.97705078125, "reward_std": 2.9260263442993164, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.4619140625, "rewards/ppl_reward/std": 8.605233192443848, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.288003146648407, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 139.328125, "completions/mean_terminated_length": 139.328125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.5300030459945173, "grad_norm": 1.6894738674163818, "kl": 4.77734375, "learning_rate": 1.1534906311021386e-05, "loss": 0.1014, "num_tokens": 38412853.0, "reward": -0.507080078125, "reward_std": 0.8942475318908691, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.60009765625, "rewards/ppl_reward/std": 2.2013099193573, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.25219154357910156, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 128.453125, "completions/mean_terminated_length": 128.453125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.5312214438014013, "grad_norm": 1.729631781578064, "kl": 5.8671875, "learning_rate": 1.1526492842188746e-05, "loss": 0.2541, "num_tokens": 38427954.0, "reward": -2.106689453125, "reward_std": 1.3780485391616821, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.69775390625, "rewards/ppl_reward/std": 6.765411853790283, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.244862899184227, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 116.859375, "completions/mean_terminated_length": 116.859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.532439841608285, "grad_norm": 2.097963333129883, "kl": 2.689453125, "learning_rate": 1.1518078266880717e-05, "loss": 0.0286, "num_tokens": 38441921.0, "reward": -2.193115234375, "reward_std": 1.1068298816680908, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.06591796875, "rewards/ppl_reward/std": 3.9555447101593018, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.533658239415169, "grad_norm": 1.5885149240493774, "kl": 4.513671875, "learning_rate": 1.1509662591196587e-05, "loss": 0.1214, "num_tokens": 38456485.0, "reward": -2.8543701171875, "reward_std": 1.1086355447769165, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.349365234375, "rewards/ppl_reward/std": 7.109326362609863, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 125.796875, "completions/mean_terminated_length": 125.796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.534876637222053, "grad_norm": 1.4123319387435913, "kl": 3.3837890625, "learning_rate": 1.150124582123644e-05, "loss": 0.079, "num_tokens": 38471288.0, "reward": -0.084716796875, "reward_std": 0.38758549094200134, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -3.98974609375, "rewards/ppl_reward/std": 1.9681061506271362, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 128.671875, "completions/mean_terminated_length": 128.671875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.536095035028937, "grad_norm": 2.071100950241089, "kl": 2.697265625, "learning_rate": 1.1492827963101159e-05, "loss": 0.061, "num_tokens": 38486171.0, "reward": -0.3306884765625, "reward_std": 0.3849596381187439, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.505126953125, "rewards/ppl_reward/std": 2.1559805870056152, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.5373134328358207, "grad_norm": 4.10507869720459, "kl": 2.90234375, "learning_rate": 1.1484409022892406e-05, "loss": 0.0364, "num_tokens": 38500943.0, "reward": -2.0701904296875, "reward_std": 1.4642359018325806, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.898193359375, "rewards/ppl_reward/std": 5.843211650848389, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 122.1875, "completions/mean_terminated_length": 122.1875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.5385318306427047, "grad_norm": 1.53372323513031, "kl": 5.5703125, "learning_rate": 1.1475989006712643e-05, "loss": 0.1623, "num_tokens": 38515211.0, "reward": -0.1300048828125, "reward_std": 0.762869119644165, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -3.838134765625, "rewards/ppl_reward/std": 1.2113454341888428, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.5397502284495888, "grad_norm": 3.154738664627075, "kl": 5.88671875, "learning_rate": 1.1467567920665093e-05, "loss": 0.2739, "num_tokens": 38530303.0, "reward": -0.5128173828125, "reward_std": 0.6127802729606628, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.728759765625, "rewards/ppl_reward/std": 2.7066235542297363, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 140.5625, "completions/mean_terminated_length": 140.5625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.540968626256473, "grad_norm": 1.5374068021774292, "kl": 3.84375, "learning_rate": 1.145914577085377e-05, "loss": 0.1348, "num_tokens": 38546091.0, "reward": -0.60601806640625, "reward_std": 0.5061246156692505, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.9229736328125, "rewards/ppl_reward/std": 1.8980743885040283, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 140.796875, "completions/mean_terminated_length": 140.796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.542187024063357, "grad_norm": 1.47981858253479, "kl": 2.80078125, "learning_rate": 1.145072256338345e-05, "loss": 0.0446, "num_tokens": 38561982.0, "reward": -0.6209716796875, "reward_std": 0.5627610087394714, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.046630859375, "rewards/ppl_reward/std": 2.177314043045044, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 140.03125, "completions/mean_terminated_length": 140.03125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.543405421870241, "grad_norm": 1.5889171361923218, "kl": 5.03125, "learning_rate": 1.144229830435968e-05, "loss": 0.0896, "num_tokens": 38578248.0, "reward": -2.02001953125, "reward_std": 0.9935211539268494, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.5322265625, "rewards/ppl_reward/std": 4.185303211212158, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 138.21875, "completions/mean_terminated_length": 138.21875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.5446238196771245, "grad_norm": 1.6575472354888916, "kl": 4.42578125, "learning_rate": 1.1433872999888771e-05, "loss": 0.0866, "num_tokens": 38594118.0, "reward": -3.4302978515625, "reward_std": 1.12252676486969, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.485595703125, "rewards/ppl_reward/std": 7.82016658782959, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 141.71875, "completions/mean_terminated_length": 141.71875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.5458422174840085, "grad_norm": 1.5455718040466309, "kl": 2.5986328125, "learning_rate": 1.1425446656077784e-05, "loss": 0.0815, "num_tokens": 38609700.0, "reward": -0.57720947265625, "reward_std": 0.7853226661682129, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.9981689453125, "rewards/ppl_reward/std": 4.91033935546875, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 139.390625, "completions/mean_terminated_length": 139.390625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.5470606152908926, "grad_norm": 1.4833494424819946, "kl": 4.14453125, "learning_rate": 1.1417019279034536e-05, "loss": 0.1524, "num_tokens": 38625293.0, "reward": -1.62548828125, "reward_std": 0.5150582790374756, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.0400390625, "rewards/ppl_reward/std": 3.5186798572540283, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 142.703125, "completions/mean_terminated_length": 142.703125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.548279013097776, "grad_norm": 2.1346402168273926, "kl": 5.341796875, "learning_rate": 1.1408590874867604e-05, "loss": 0.1578, "num_tokens": 38641786.0, "reward": -11.14501953125, "reward_std": 3.72320818901062, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -25.9072265625, "rewards/ppl_reward/std": 58.314422607421875, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 132.90625, "completions/mean_terminated_length": 132.90625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.54949741090466, "grad_norm": 1.784375786781311, "kl": 3.8046875, "learning_rate": 1.1400161449686293e-05, "loss": 0.0089, "num_tokens": 38656772.0, "reward": -1.7216796875, "reward_std": 0.9866641163825989, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.982421875, "rewards/ppl_reward/std": 3.71207332611084, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.5507158087115442, "grad_norm": 2.3933331966400146, "kl": 6.1015625, "learning_rate": 1.1391731009600655e-05, "loss": 0.2192, "num_tokens": 38673880.0, "reward": -1.1201171875, "reward_std": 0.840438961982727, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.677734375, "rewards/ppl_reward/std": 2.374558687210083, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.24193336069583893, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 137.171875, "completions/mean_terminated_length": 137.171875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.5519342065184283, "grad_norm": 2.00834059715271, "kl": 4.2158203125, "learning_rate": 1.1383299560721483e-05, "loss": 0.0326, "num_tokens": 38689515.0, "reward": -0.19281005859375, "reward_std": 0.8751533627510071, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -3.9481201171875, "rewards/ppl_reward/std": 2.012047052383423, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.5531526043253123, "grad_norm": 1.9187082052230835, "kl": 5.0546875, "learning_rate": 1.1374867109160295e-05, "loss": 0.1939, "num_tokens": 38706540.0, "reward": -1.5584716796875, "reward_std": 1.1085351705551147, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.765380859375, "rewards/ppl_reward/std": 4.253720283508301, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.5543710021321964, "grad_norm": 2.7574703693389893, "kl": 5.5546875, "learning_rate": 1.1366433661029337e-05, "loss": 0.1842, "num_tokens": 38723130.0, "reward": -1.14691162109375, "reward_std": 0.7296314239501953, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.8719482421875, "rewards/ppl_reward/std": 2.9635863304138184, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.55558939993908, "grad_norm": 2.8896188735961914, "kl": 7.83984375, "learning_rate": 1.1357999222441573e-05, "loss": 0.2382, "num_tokens": 38738066.0, "reward": -8.0423583984375, "reward_std": 12.64868450164795, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -19.405029296875, "rewards/ppl_reward/std": 71.18456268310547, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 169.9375, "completions/mean_terminated_length": 169.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.556807797745964, "grad_norm": 1.7190433740615845, "kl": 5.39453125, "learning_rate": 1.1349563799510698e-05, "loss": 0.2271, "num_tokens": 38757294.0, "reward": -0.186767578125, "reward_std": 0.46788209676742554, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.01416015625, "rewards/ppl_reward/std": 1.7878921031951904, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 153.015625, "completions/mean_terminated_length": 153.015625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.558026195552848, "grad_norm": 1.8128306865692139, "kl": 5.6171875, "learning_rate": 1.1341127398351106e-05, "loss": 0.1503, "num_tokens": 38774391.0, "reward": -3.04638671875, "reward_std": 1.7993178367614746, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -9.4755859375, "rewards/ppl_reward/std": 8.330133438110352, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2512061595916748, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 150.890625, "completions/mean_terminated_length": 150.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.559244593359732, "grad_norm": 1.3733971118927002, "kl": 3.83203125, "learning_rate": 1.1332690025077908e-05, "loss": 0.0379, "num_tokens": 38791120.0, "reward": -1.4716796875, "reward_std": 0.564237117767334, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.591796875, "rewards/ppl_reward/std": 5.3199639320373535, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 141.796875, "completions/mean_terminated_length": 141.796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.5604629911666157, "grad_norm": 1.63856041431427, "kl": 4.3203125, "learning_rate": 1.132425168580692e-05, "loss": 0.0911, "num_tokens": 38806963.0, "reward": -2.7598876953125, "reward_std": 1.6115539073944092, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.980712890625, "rewards/ppl_reward/std": 7.801312446594238, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22980836033821106, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 157.921875, "completions/mean_terminated_length": 157.921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.5616813889734997, "grad_norm": 1.6771302223205566, "kl": 4.63671875, "learning_rate": 1.131581238665465e-05, "loss": 0.097, "num_tokens": 38823798.0, "reward": -5.1396484375, "reward_std": 3.4806900024414062, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -13.896484375, "rewards/ppl_reward/std": 25.119340896606445, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 145.046875, "completions/mean_terminated_length": 145.046875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.5628997867803838, "grad_norm": 2.0757484436035156, "kl": 4.169921875, "learning_rate": 1.130737213373831e-05, "loss": 0.1001, "num_tokens": 38839553.0, "reward": -0.5928955078125, "reward_std": 0.8459765315055847, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.771728515625, "rewards/ppl_reward/std": 2.731142044067383, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 155.15625, "completions/mean_terminated_length": 155.15625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.564118184587268, "grad_norm": 1.8779819011688232, "kl": 2.111328125, "learning_rate": 1.1298930933175805e-05, "loss": 0.038, "num_tokens": 38856491.0, "reward": -0.5037841796875, "reward_std": 0.4348207712173462, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.827880859375, "rewards/ppl_reward/std": 1.6227126121520996, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 144.59375, "completions/mean_terminated_length": 144.59375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.565336582394152, "grad_norm": 2.0891451835632324, "kl": 4.064453125, "learning_rate": 1.129048879108572e-05, "loss": 0.1672, "num_tokens": 38872313.0, "reward": -1.0582275390625, "reward_std": 0.8856350779533386, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.772705078125, "rewards/ppl_reward/std": 4.901930809020996, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 151.453125, "completions/mean_terminated_length": 151.453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.566554980201036, "grad_norm": 2.0692873001098633, "kl": 3.1796875, "learning_rate": 1.1282045713587324e-05, "loss": 0.079, "num_tokens": 38888606.0, "reward": -0.479736328125, "reward_std": 0.779561460018158, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.63134765625, "rewards/ppl_reward/std": 1.582914113998413, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 145.015625, "completions/mean_terminated_length": 145.015625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.5677733780079195, "grad_norm": 1.8613210916519165, "kl": 4.9921875, "learning_rate": 1.127360170680057e-05, "loss": 0.1246, "num_tokens": 38904631.0, "reward": -0.92376708984375, "reward_std": 0.6880893707275391, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.4256591796875, "rewards/ppl_reward/std": 4.041312217712402, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 157.546875, "completions/mean_terminated_length": 157.546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.5689917758148035, "grad_norm": 1.8829903602600098, "kl": 1.865234375, "learning_rate": 1.1265156776846074e-05, "loss": 0.0343, "num_tokens": 38921730.0, "reward": -0.5328369140625, "reward_std": 0.6292276382446289, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.979736328125, "rewards/ppl_reward/std": 2.5425212383270264, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 157.28125, "completions/mean_terminated_length": 157.28125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.5702101736216876, "grad_norm": 1.5117769241333008, "kl": 3.365234375, "learning_rate": 1.125671092984513e-05, "loss": 0.1331, "num_tokens": 38938676.0, "reward": -1.24462890625, "reward_std": 0.5985305905342102, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.2783203125, "rewards/ppl_reward/std": 3.3956267833709717, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 145.84375, "completions/mean_terminated_length": 145.84375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.571428571428571, "grad_norm": 2.449568748474121, "kl": 4.9638671875, "learning_rate": 1.1248264171919696e-05, "loss": 0.1789, "num_tokens": 38954770.0, "reward": -2.1053466796875, "reward_std": 0.7727399468421936, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.890380859375, "rewards/ppl_reward/std": 11.830643653869629, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.572646969235455, "grad_norm": 2.1374919414520264, "kl": 5.197265625, "learning_rate": 1.1239816509192385e-05, "loss": 0.0604, "num_tokens": 38971402.0, "reward": -1.823486328125, "reward_std": 1.8122060298919678, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.10009765625, "rewards/ppl_reward/std": 5.606746673583984, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 144.515625, "completions/mean_terminated_length": 144.515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.5738653670423393, "grad_norm": 3.2565758228302, "kl": 4.3125, "learning_rate": 1.123136794778647e-05, "loss": 0.1202, "num_tokens": 38987523.0, "reward": -1.8792724609375, "reward_std": 0.6590291261672974, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.305419921875, "rewards/ppl_reward/std": 5.244633197784424, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 140.515625, "completions/mean_terminated_length": 140.515625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.5750837648492233, "grad_norm": 3.530477285385132, "kl": 5.701171875, "learning_rate": 1.1222918493825876e-05, "loss": 0.1555, "num_tokens": 39003820.0, "reward": -1.5806884765625, "reward_std": 0.7173137664794922, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.645751953125, "rewards/ppl_reward/std": 4.410815715789795, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2280818521976471, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.5763021626561073, "grad_norm": 2.5463919639587402, "kl": 3.064453125, "learning_rate": 1.1214468153435172e-05, "loss": 0.0749, "num_tokens": 39018612.0, "reward": -1.732177734375, "reward_std": 0.5664527416229248, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.23779296875, "rewards/ppl_reward/std": 5.400424957275391, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 144.6875, "completions/mean_terminated_length": 144.6875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.5775205604629914, "grad_norm": 1.2991020679473877, "kl": 4.7421875, "learning_rate": 1.1206016932739569e-05, "loss": 0.1654, "num_tokens": 39034264.0, "reward": -2.5374755859375, "reward_std": 1.3090934753417969, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.770263671875, "rewards/ppl_reward/std": 5.782548904418945, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 135.421875, "completions/mean_terminated_length": 135.421875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.578738958269875, "grad_norm": 4.109812259674072, "kl": 7.9921875, "learning_rate": 1.1197564837864923e-05, "loss": 0.2405, "num_tokens": 39049539.0, "reward": -2.7686767578125, "reward_std": 1.7593625783920288, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -8.943603515625, "rewards/ppl_reward/std": 7.29717493057251, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 130.703125, "completions/mean_terminated_length": 130.703125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 2.579957356076759, "grad_norm": 2.797473669052124, "kl": 5.37109375, "learning_rate": 1.1189111874937712e-05, "loss": 0.1485, "num_tokens": 39064888.0, "reward": -1.0552978515625, "reward_std": 1.2880172729492188, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.438720703125, "rewards/ppl_reward/std": 3.6999998092651367, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.27716949582099915, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.581175753883643, "grad_norm": 2.0377042293548584, "kl": 7.296875, "learning_rate": 1.1180658050085056e-05, "loss": 0.417, "num_tokens": 39082864.0, "reward": -1.0272216796875, "reward_std": 0.6023987531661987, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.671630859375, "rewards/ppl_reward/std": 2.852731943130493, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 138.140625, "completions/mean_terminated_length": 138.140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.582394151690527, "grad_norm": 1.4193490743637085, "kl": 1.333984375, "learning_rate": 1.1172203369434692e-05, "loss": 0.0142, "num_tokens": 39098849.0, "reward": -2.5726318359375, "reward_std": 0.4452693462371826, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -9.051513671875, "rewards/ppl_reward/std": 5.068614482879639, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 137.484375, "completions/mean_terminated_length": 137.484375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.5836125494974107, "grad_norm": 1.5873571634292603, "kl": 3.65234375, "learning_rate": 1.1163747839114976e-05, "loss": 0.0455, "num_tokens": 39114784.0, "reward": -1.10498046875, "reward_std": 1.3515043258666992, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.7099609375, "rewards/ppl_reward/std": 3.3480420112609863, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2539372742176056, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 151.578125, "completions/mean_terminated_length": 151.578125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.5848309473042947, "grad_norm": 1.3007339239120483, "kl": 1.30078125, "learning_rate": 1.1155291465254885e-05, "loss": 0.0283, "num_tokens": 39132845.0, "reward": -1.2408447265625, "reward_std": 0.41829124093055725, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.341064453125, "rewards/ppl_reward/std": 3.361614465713501, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.586049345111179, "grad_norm": 1.834315299987793, "kl": 2.9375, "learning_rate": 1.1146834253984008e-05, "loss": 0.1042, "num_tokens": 39148465.0, "reward": -0.48309326171875, "reward_std": 0.8254390358924866, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.7943115234375, "rewards/ppl_reward/std": 3.8467249870300293, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.587267742918063, "grad_norm": 2.0044469833374023, "kl": 3.87109375, "learning_rate": 1.1138376211432533e-05, "loss": 0.0612, "num_tokens": 39163129.0, "reward": -1.4713134765625, "reward_std": 0.8185783624649048, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.723876953125, "rewards/ppl_reward/std": 4.013893127441406, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.588486140724947, "grad_norm": 1.5166736841201782, "kl": 5.5478515625, "learning_rate": 1.1129917343731259e-05, "loss": 0.2004, "num_tokens": 39179417.0, "reward": -1.0628662109375, "reward_std": 0.4962102174758911, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.781982421875, "rewards/ppl_reward/std": 3.910327672958374, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 130.09375, "completions/mean_terminated_length": 130.09375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 2.589704538531831, "grad_norm": 1.7544784545898438, "kl": 4.349609375, "learning_rate": 1.1121457657011576e-05, "loss": 0.1252, "num_tokens": 39194311.0, "reward": -2.702880859375, "reward_std": 2.1016712188720703, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.02294921875, "rewards/ppl_reward/std": 7.489646911621094, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.23249077796936035, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 141.078125, "completions/mean_terminated_length": 141.078125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.5909229363387145, "grad_norm": 2.011356830596924, "kl": 4.958984375, "learning_rate": 1.1112997157405478e-05, "loss": 0.1476, "num_tokens": 39211372.0, "reward": -1.1016845703125, "reward_std": 0.6449880003929138, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.765869140625, "rewards/ppl_reward/std": 2.3462347984313965, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 131.09375, "completions/mean_terminated_length": 131.09375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.5921413341455986, "grad_norm": 3.9474411010742188, "kl": 8.6953125, "learning_rate": 1.110453585104554e-05, "loss": 0.2728, "num_tokens": 39226826.0, "reward": -0.84906005859375, "reward_std": 0.747249960899353, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.1121826171875, "rewards/ppl_reward/std": 3.2612080574035645, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 123.46875, "completions/mean_terminated_length": 123.46875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.5933597319524826, "grad_norm": 1.3788094520568848, "kl": 5.37890625, "learning_rate": 1.1096073744064918e-05, "loss": 0.1694, "num_tokens": 39241128.0, "reward": -1.6204833984375, "reward_std": 1.2562079429626465, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.842529296875, "rewards/ppl_reward/std": 4.158116817474365, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 127.015625, "completions/mean_terminated_length": 127.015625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.594578129759366, "grad_norm": 2.1181931495666504, "kl": 2.4375, "learning_rate": 1.1087610842597362e-05, "loss": 0.0482, "num_tokens": 39255985.0, "reward": -2.2489013671875, "reward_std": 1.5638911724090576, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.255615234375, "rewards/ppl_reward/std": 7.7015228271484375, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.5957965275662502, "grad_norm": 1.655722737312317, "kl": 2.4521484375, "learning_rate": 1.1079147152777184e-05, "loss": 0.0617, "num_tokens": 39271265.0, "reward": -1.05859375, "reward_std": 0.4795834422111511, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.9765625, "rewards/ppl_reward/std": 3.3197968006134033, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 129.828125, "completions/mean_terminated_length": 129.828125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.5970149253731343, "grad_norm": 2.380650043487549, "kl": 6.11328125, "learning_rate": 1.1070682680739275e-05, "loss": 0.2803, "num_tokens": 39286534.0, "reward": -0.8525390625, "reward_std": 0.5570662021636963, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.330078125, "rewards/ppl_reward/std": 1.3554266691207886, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 127.921875, "completions/mean_terminated_length": 127.921875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.5982333231800183, "grad_norm": 1.7723592519760132, "kl": 1.955078125, "learning_rate": 1.1062217432619095e-05, "loss": 0.0724, "num_tokens": 39301241.0, "reward": -1.7940673828125, "reward_std": 0.6125337481498718, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.478759765625, "rewards/ppl_reward/std": 4.216468811035156, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 134.921875, "completions/mean_terminated_length": 134.921875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.5994517209869024, "grad_norm": 2.2010748386383057, "kl": 2.513671875, "learning_rate": 1.1053751414552654e-05, "loss": 0.1493, "num_tokens": 39317084.0, "reward": -4.4515380859375, "reward_std": 0.5504146814346313, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -12.832763671875, "rewards/ppl_reward/std": 19.318984985351562, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 139.859375, "completions/mean_terminated_length": 139.859375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.6006701187937864, "grad_norm": 2.2412073612213135, "kl": 2.6064453125, "learning_rate": 1.1045284632676535e-05, "loss": 0.0672, "num_tokens": 39334395.0, "reward": -0.9300537109375, "reward_std": 0.5848239660263062, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.617919921875, "rewards/ppl_reward/std": 5.015283107757568, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 128.015625, "completions/mean_terminated_length": 128.015625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.60188851660067, "grad_norm": 1.9728425741195679, "kl": 4.8828125, "learning_rate": 1.103681709312787e-05, "loss": 0.0975, "num_tokens": 39349284.0, "reward": -1.4583740234375, "reward_std": 0.9578179121017456, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.557373046875, "rewards/ppl_reward/std": 3.4072954654693604, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 113.515625, "completions/mean_terminated_length": 113.515625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.603106914407554, "grad_norm": 1.8351751565933228, "kl": 4.01953125, "learning_rate": 1.1028348802044336e-05, "loss": 0.0863, "num_tokens": 39363093.0, "reward": -2.670654296875, "reward_std": 0.8122847080230713, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.94287109375, "rewards/ppl_reward/std": 8.462868690490723, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 129.09375, "completions/mean_terminated_length": 129.09375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.604325312214438, "grad_norm": 1.9519507884979248, "kl": 3.58984375, "learning_rate": 1.1019879765564155e-05, "loss": 0.1121, "num_tokens": 39377915.0, "reward": -1.9405517578125, "reward_std": 0.513343334197998, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.724853515625, "rewards/ppl_reward/std": 6.255847454071045, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 141.6875, "completions/mean_terminated_length": 141.6875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.605543710021322, "grad_norm": 2.0284736156463623, "kl": 4.3515625, "learning_rate": 1.1011409989826094e-05, "loss": 0.1585, "num_tokens": 39393855.0, "reward": -0.673828125, "reward_std": 1.3420029878616333, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.01953125, "rewards/ppl_reward/std": 5.466305255889893, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 141.0625, "completions/mean_terminated_length": 141.0625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.6067621078282057, "grad_norm": 2.2627272605895996, "kl": 2.474609375, "learning_rate": 1.1002939480969457e-05, "loss": 0.0211, "num_tokens": 39410515.0, "reward": -2.0455322265625, "reward_std": 0.5447996258735657, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.880126953125, "rewards/ppl_reward/std": 6.599042892456055, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 135.234375, "completions/mean_terminated_length": 135.234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.6079805056350898, "grad_norm": 1.4937036037445068, "kl": 2.96875, "learning_rate": 1.0994468245134071e-05, "loss": 0.1318, "num_tokens": 39425858.0, "reward": -2.46630859375, "reward_std": 0.4202907085418701, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.8076171875, "rewards/ppl_reward/std": 6.310473918914795, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 140.390625, "completions/mean_terminated_length": 140.390625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.609198903441974, "grad_norm": 2.178314685821533, "kl": 1.7197265625, "learning_rate": 1.09859962884603e-05, "loss": 0.0332, "num_tokens": 39441475.0, "reward": -1.16204833984375, "reward_std": 0.8597705364227295, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.1990966796875, "rewards/ppl_reward/std": 4.066897869110107, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 131.09375, "completions/mean_terminated_length": 131.09375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.610417301248858, "grad_norm": 2.6563923358917236, "kl": 5.51171875, "learning_rate": 1.0977523617089019e-05, "loss": 0.0844, "num_tokens": 39456273.0, "reward": -0.954833984375, "reward_std": 1.0488240718841553, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.35498046875, "rewards/ppl_reward/std": 3.6751084327697754, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2372427135705948, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 129.140625, "completions/mean_terminated_length": 129.140625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.611635699055742, "grad_norm": 1.855804204940796, "kl": 4.611328125, "learning_rate": 1.0969050237161632e-05, "loss": 0.1369, "num_tokens": 39471698.0, "reward": -0.69757080078125, "reward_std": 0.8060824871063232, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.9498291015625, "rewards/ppl_reward/std": 2.903202533721924, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21474508941173553, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 140.03125, "completions/mean_terminated_length": 140.03125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.6128540968626255, "grad_norm": 2.2439239025115967, "kl": 5.42578125, "learning_rate": 1.096057615482005e-05, "loss": 0.2713, "num_tokens": 39488468.0, "reward": -2.8697509765625, "reward_std": 2.407114267349243, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.348876953125, "rewards/ppl_reward/std": 13.716449737548828, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 132.40625, "completions/mean_terminated_length": 132.40625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.6140724946695095, "grad_norm": 2.324995517730713, "kl": 6.17578125, "learning_rate": 1.09521013762067e-05, "loss": 0.13, "num_tokens": 39503934.0, "reward": -1.5775146484375, "reward_std": 1.4465413093566895, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.514404296875, "rewards/ppl_reward/std": 3.4609012603759766, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.28510910272598267, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 133.328125, "completions/mean_terminated_length": 133.328125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.6152908924763936, "grad_norm": 2.9967150688171387, "kl": 5.013671875, "learning_rate": 1.0943625907464499e-05, "loss": 0.0908, "num_tokens": 39519899.0, "reward": -0.09710693359375, "reward_std": 0.7173599600791931, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.8817138671875, "rewards/ppl_reward/std": 1.8317965269088745, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 133.8125, "completions/mean_terminated_length": 133.8125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.6165092902832776, "grad_norm": 3.1277544498443604, "kl": 5.91796875, "learning_rate": 1.0935149754736882e-05, "loss": 0.2136, "num_tokens": 39535335.0, "reward": -0.8046875, "reward_std": 0.6868209838867188, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.1875, "rewards/ppl_reward/std": 2.3619790077209473, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14412261545658112, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 131.609375, "completions/mean_terminated_length": 131.609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.617727688090161, "grad_norm": 2.030205488204956, "kl": 3.203125, "learning_rate": 1.0926672924167774e-05, "loss": 0.1106, "num_tokens": 39550342.0, "reward": -1.2630615234375, "reward_std": 0.904425323009491, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.205810546875, "rewards/ppl_reward/std": 5.352670192718506, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 135.859375, "completions/mean_terminated_length": 135.859375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.6189460858970453, "grad_norm": 2.1597063541412354, "kl": 5.24609375, "learning_rate": 1.0918195421901583e-05, "loss": 0.2336, "num_tokens": 39565565.0, "reward": -2.815673828125, "reward_std": 2.13258695602417, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.18603515625, "rewards/ppl_reward/std": 6.477847099304199, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 135.671875, "completions/mean_terminated_length": 135.671875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.6201644837039293, "grad_norm": 2.393012762069702, "kl": 6.99609375, "learning_rate": 1.0909717254083213e-05, "loss": 0.3456, "num_tokens": 39581232.0, "reward": -2.3291015625, "reward_std": 0.8530218005180359, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.283203125, "rewards/ppl_reward/std": 5.607891082763672, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.125, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 126.734375, "completions/mean_terminated_length": 126.734375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.6213828815108133, "grad_norm": 2.9447803497314453, "kl": 4.5556640625, "learning_rate": 1.0901238426858048e-05, "loss": 0.1317, "num_tokens": 39596951.0, "reward": -2.690673828125, "reward_std": 1.0697470903396606, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.91259765625, "rewards/ppl_reward/std": 5.184248447418213, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.6226012793176974, "grad_norm": 1.977567195892334, "kl": 7.978515625, "learning_rate": 1.0892758946371943e-05, "loss": 0.455, "num_tokens": 39612887.0, "reward": -1.1556396484375, "reward_std": 1.4832762479782104, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.655029296875, "rewards/ppl_reward/std": 4.373610496520996, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 142.984375, "completions/mean_terminated_length": 142.984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.6238196771245814, "grad_norm": 2.896449565887451, "kl": 9.708984375, "learning_rate": 1.0884278818771244e-05, "loss": 0.5082, "num_tokens": 39628646.0, "reward": -1.8292236328125, "reward_std": 1.0195934772491455, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.088134765625, "rewards/ppl_reward/std": 4.833582878112793, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2138771414756775, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 132.65625, "completions/mean_terminated_length": 132.65625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.625038074931465, "grad_norm": 1.9976485967636108, "kl": 4.1953125, "learning_rate": 1.087579805020275e-05, "loss": 0.141, "num_tokens": 39643752.0, "reward": -2.421875, "reward_std": 1.6607760190963745, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.4453125, "rewards/ppl_reward/std": 9.080451965332031, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.626256472738349, "grad_norm": 1.8959547281265259, "kl": 3.15625, "learning_rate": 1.086731664681373e-05, "loss": 0.0332, "num_tokens": 39658712.0, "reward": -3.371337890625, "reward_std": 1.0349878072738647, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.34423828125, "rewards/ppl_reward/std": 14.038958549499512, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 127.4375, "completions/mean_terminated_length": 127.4375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.627474870545233, "grad_norm": 2.437776565551758, "kl": 2.779296875, "learning_rate": 1.085883461475191e-05, "loss": 0.0651, "num_tokens": 39674148.0, "reward": -0.8587646484375, "reward_std": 0.7688645124435425, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.412841796875, "rewards/ppl_reward/std": 3.1458897590637207, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.6286932683521167, "grad_norm": 2.2533116340637207, "kl": 4.546875, "learning_rate": 1.085035196016548e-05, "loss": 0.1533, "num_tokens": 39688212.0, "reward": -0.906982421875, "reward_std": 0.6944828629493713, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.41552734375, "rewards/ppl_reward/std": 3.532935857772827, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.6299116661590007, "grad_norm": 1.7803714275360107, "kl": 3.05859375, "learning_rate": 1.0841868689203072e-05, "loss": 0.0661, "num_tokens": 39703716.0, "reward": -1.1717529296875, "reward_std": 0.4805957078933716, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.210693359375, "rewards/ppl_reward/std": 5.858547687530518, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 121.546875, "completions/mean_terminated_length": 121.546875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.631130063965885, "grad_norm": 2.2561123371124268, "kl": 4.01171875, "learning_rate": 1.0833384808013768e-05, "loss": 0.1831, "num_tokens": 39718119.0, "reward": -0.22650146484375, "reward_std": 0.45723849534988403, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.1795654296875, "rewards/ppl_reward/std": 1.528807520866394, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 127.140625, "completions/mean_terminated_length": 127.140625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.632348461772769, "grad_norm": 3.4428415298461914, "kl": 6.56640625, "learning_rate": 1.0824900322747094e-05, "loss": 0.2737, "num_tokens": 39733448.0, "reward": -1.549560546875, "reward_std": 0.811095118522644, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.44287109375, "rewards/ppl_reward/std": 4.20937967300415, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20653989911079407, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.633566859579653, "grad_norm": 1.7017093896865845, "kl": 3.1298828125, "learning_rate": 1.0816415239553014e-05, "loss": 0.0951, "num_tokens": 39748408.0, "reward": -2.168701171875, "reward_std": 0.38057684898376465, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.19677734375, "rewards/ppl_reward/std": 4.3641133308410645, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 120.21875, "completions/mean_terminated_length": 120.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.634785257386537, "grad_norm": 1.7783750295639038, "kl": 3.19921875, "learning_rate": 1.0807929564581925e-05, "loss": 0.1111, "num_tokens": 39762598.0, "reward": -3.6533203125, "reward_std": 4.222900390625, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.978515625, "rewards/ppl_reward/std": 26.62481117248535, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 121.21875, "completions/mean_terminated_length": 121.21875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.6360036551934205, "grad_norm": 2.311161518096924, "kl": 6.6484375, "learning_rate": 1.0799443303984649e-05, "loss": 0.2191, "num_tokens": 39776948.0, "reward": -1.87451171875, "reward_std": 1.9066555500030518, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.1787109375, "rewards/ppl_reward/std": 4.638349533081055, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.23408547043800354, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.6372220530003045, "grad_norm": 4.085903167724609, "kl": 8.4609375, "learning_rate": 1.0790956463912439e-05, "loss": 0.3465, "num_tokens": 39791588.0, "reward": -2.663330078125, "reward_std": 1.4633392095565796, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -8.68603515625, "rewards/ppl_reward/std": 6.002346038818359, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.6384404508071886, "grad_norm": 2.8531808853149414, "kl": 7.46875, "learning_rate": 1.078246905051696e-05, "loss": 0.3313, "num_tokens": 39807132.0, "reward": -1.522705078125, "reward_std": 0.8068622350692749, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.50634765625, "rewards/ppl_reward/std": 3.862929344177246, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.15585267543792725, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 130.96875, "completions/mean_terminated_length": 130.96875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.6396588486140726, "grad_norm": 1.9261062145233154, "kl": 3.63671875, "learning_rate": 1.0773981069950298e-05, "loss": 0.1304, "num_tokens": 39823394.0, "reward": -1.175048828125, "reward_std": 0.46626389026641846, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.04541015625, "rewards/ppl_reward/std": 1.3349475860595703, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.6408772464209562, "grad_norm": 1.8189654350280762, "kl": 5.04296875, "learning_rate": 1.076549252836496e-05, "loss": 0.1591, "num_tokens": 39838474.0, "reward": -1.28350830078125, "reward_std": 0.837858259677887, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.1529541015625, "rewards/ppl_reward/std": 3.930150270462036, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 128.109375, "completions/mean_terminated_length": 128.109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.6420956442278403, "grad_norm": 2.363102674484253, "kl": 4.1328125, "learning_rate": 1.0757003431913833e-05, "loss": 0.1806, "num_tokens": 39853393.0, "reward": -0.361572265625, "reward_std": 0.3753918409347534, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.55908203125, "rewards/ppl_reward/std": 2.048436164855957, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 133.0625, "completions/mean_terminated_length": 133.0625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.6433140420347243, "grad_norm": 3.725149154663086, "kl": 6.6796875, "learning_rate": 1.074851378675023e-05, "loss": 0.3231, "num_tokens": 39869413.0, "reward": -1.4375, "reward_std": 0.8047508001327515, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.5625, "rewards/ppl_reward/std": 4.16146183013916, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.6445324398416084, "grad_norm": 2.0628678798675537, "kl": 8.05859375, "learning_rate": 1.0740023599027855e-05, "loss": 0.3848, "num_tokens": 39888261.0, "reward": -1.76513671875, "reward_std": 1.2797726392745972, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.8974609375, "rewards/ppl_reward/std": 3.308941602706909, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.25946253538131714, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 137.8125, "completions/mean_terminated_length": 137.8125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.6457508376484924, "grad_norm": 3.505431890487671, "kl": 8.1875, "learning_rate": 1.0731532874900804e-05, "loss": 0.342, "num_tokens": 39904369.0, "reward": -2.740966796875, "reward_std": 0.9837250709533691, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.96630859375, "rewards/ppl_reward/std": 6.232511043548584, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13706642389297485, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 135.921875, "completions/mean_terminated_length": 135.921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.6469692354553764, "grad_norm": 1.9087729454040527, "kl": 4.791015625, "learning_rate": 1.0723041620523558e-05, "loss": 0.1786, "num_tokens": 39920372.0, "reward": -2.075927734375, "reward_std": 0.9902807474136353, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.76123046875, "rewards/ppl_reward/std": 7.717649459838867, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 126.953125, "completions/mean_terminated_length": 126.953125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.64818763326226, "grad_norm": 3.4735443592071533, "kl": 8.265625, "learning_rate": 1.0714549842050987e-05, "loss": 0.2384, "num_tokens": 39935521.0, "reward": -0.97607421875, "reward_std": 1.324784755706787, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.2490234375, "rewards/ppl_reward/std": 2.6812305450439453, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2302463799715042, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 148.53125, "completions/mean_terminated_length": 148.53125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.649406031069144, "grad_norm": 4.097495079040527, "kl": 7.9384765625, "learning_rate": 1.0706057545638346e-05, "loss": 0.3853, "num_tokens": 39952419.0, "reward": -0.4666748046875, "reward_std": 0.5278956890106201, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.550537109375, "rewards/ppl_reward/std": 1.433133602142334, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2052978277206421, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 132.078125, "completions/mean_terminated_length": 132.078125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.650624428876028, "grad_norm": 2.123769521713257, "kl": 4.763671875, "learning_rate": 1.0697564737441254e-05, "loss": 0.1478, "num_tokens": 39967904.0, "reward": -0.97589111328125, "reward_std": 0.7680187225341797, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.5142822265625, "rewards/ppl_reward/std": 3.3627099990844727, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.6518428266829117, "grad_norm": 2.3820128440856934, "kl": 4.41796875, "learning_rate": 1.0689071423615708e-05, "loss": 0.3324, "num_tokens": 39983676.0, "reward": -2.139892578125, "reward_std": 0.4724240303039551, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.10009765625, "rewards/ppl_reward/std": 6.536725044250488, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 126.546875, "completions/mean_terminated_length": 126.546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.6530612244897958, "grad_norm": 2.673809289932251, "kl": 4.306640625, "learning_rate": 1.0680577610318073e-05, "loss": 0.2021, "num_tokens": 39998295.0, "reward": -0.849609375, "reward_std": 0.42652419209480286, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.39453125, "rewards/ppl_reward/std": 2.400545358657837, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 122.203125, "completions/mean_terminated_length": 122.203125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.65427962229668, "grad_norm": 1.6263694763183594, "kl": 4.66796875, "learning_rate": 1.0672083303705067e-05, "loss": 0.1682, "num_tokens": 40012732.0, "reward": -2.62908935546875, "reward_std": 1.5301120281219482, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.8675537109375, "rewards/ppl_reward/std": 6.56544303894043, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 130.234375, "completions/mean_terminated_length": 130.234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.655498020103564, "grad_norm": 2.0331952571868896, "kl": 4.33203125, "learning_rate": 1.0663588509933778e-05, "loss": 0.1459, "num_tokens": 40027779.0, "reward": -0.5755615234375, "reward_std": 0.45993632078170776, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.815185546875, "rewards/ppl_reward/std": 1.4427945613861084, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.656716417910448, "grad_norm": 1.677159070968628, "kl": 3.859375, "learning_rate": 1.065509323516164e-05, "loss": 0.1484, "num_tokens": 40043939.0, "reward": -0.7213134765625, "reward_std": 0.6159806251525879, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.059814453125, "rewards/ppl_reward/std": 1.8838201761245728, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 150.953125, "completions/mean_terminated_length": 150.953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.657934815717332, "grad_norm": 1.88153076171875, "kl": 5.1875, "learning_rate": 1.0646597485546432e-05, "loss": 0.2415, "num_tokens": 40061768.0, "reward": -0.8419189453125, "reward_std": 0.524622917175293, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.363525390625, "rewards/ppl_reward/std": 3.423581123352051, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.6591532135242155, "grad_norm": 2.1203272342681885, "kl": 7.015625, "learning_rate": 1.0638101267246283e-05, "loss": 0.2405, "num_tokens": 40080636.0, "reward": -2.486572265625, "reward_std": 2.291980504989624, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.33251953125, "rewards/ppl_reward/std": 10.008965492248535, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2236899733543396, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 129.796875, "completions/mean_terminated_length": 129.796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.6603716113310996, "grad_norm": 1.8352811336517334, "kl": 3.6875, "learning_rate": 1.0629604586419666e-05, "loss": 0.0627, "num_tokens": 40095415.0, "reward": -1.17645263671875, "reward_std": 0.8673622608184814, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.9857177734375, "rewards/ppl_reward/std": 4.114120006561279, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 126.8125, "completions/mean_terminated_length": 126.8125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.6615900091379836, "grad_norm": 3.199890375137329, "kl": 3.48046875, "learning_rate": 1.0621107449225379e-05, "loss": 0.1594, "num_tokens": 40110219.0, "reward": -0.904296875, "reward_std": 0.9795452356338501, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.49609375, "rewards/ppl_reward/std": 3.2186086177825928, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 134.140625, "completions/mean_terminated_length": 134.140625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.6628084069448676, "grad_norm": 1.8896749019622803, "kl": 6.73046875, "learning_rate": 1.0612609861822556e-05, "loss": 0.3285, "num_tokens": 40125836.0, "reward": -0.9652099609375, "reward_std": 0.7227961421012878, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.485107421875, "rewards/ppl_reward/std": 4.0951247215271, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 121.40625, "completions/mean_terminated_length": 121.40625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.6640268047517512, "grad_norm": 3.4281792640686035, "kl": 7.57421875, "learning_rate": 1.0604111830370656e-05, "loss": 0.2923, "num_tokens": 40139934.0, "reward": -2.587646484375, "reward_std": 2.006659984588623, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -8.22998046875, "rewards/ppl_reward/std": 7.61926794052124, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.2627868354320526, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.6652452025586353, "grad_norm": 2.0677411556243896, "kl": 5.078125, "learning_rate": 1.0595613361029462e-05, "loss": 0.2092, "num_tokens": 40156186.0, "reward": -2.0770263671875, "reward_std": 0.975111722946167, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.685302734375, "rewards/ppl_reward/std": 4.482025146484375, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 138.265625, "completions/mean_terminated_length": 138.265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.6664636003655193, "grad_norm": 2.0059726238250732, "kl": 5.75, "learning_rate": 1.058711445995907e-05, "loss": 0.2108, "num_tokens": 40172235.0, "reward": -1.857177734375, "reward_std": 0.903321385383606, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.17529296875, "rewards/ppl_reward/std": 6.309180736541748, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14255455136299133, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 138.90625, "completions/mean_terminated_length": 138.90625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.6676819981724034, "grad_norm": 2.4425065517425537, "kl": 7.64453125, "learning_rate": 1.0578615133319901e-05, "loss": 0.3547, "num_tokens": 40188821.0, "reward": -7.305419921875, "reward_std": 3.392451286315918, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -17.86865234375, "rewards/ppl_reward/std": 27.872514724731445, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2597014009952545, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 131.46875, "completions/mean_terminated_length": 131.46875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.6689003959792874, "grad_norm": 1.5861189365386963, "kl": 4.060546875, "learning_rate": 1.0570115387272666e-05, "loss": 0.1512, "num_tokens": 40204075.0, "reward": -1.47210693359375, "reward_std": 0.6458339095115662, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.3426513671875, "rewards/ppl_reward/std": 6.6904754638671875, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 144.59375, "completions/mean_terminated_length": 144.59375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.6701187937861715, "grad_norm": 2.639522075653076, "kl": 4.2578125, "learning_rate": 1.0561615227978393e-05, "loss": 0.2276, "num_tokens": 40221777.0, "reward": -1.9364013671875, "reward_std": 0.5802242159843445, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.638427734375, "rewards/ppl_reward/std": 5.274082183837891, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.671337191593055, "grad_norm": 11.125504493713379, "kl": 3.25390625, "learning_rate": 1.0553114661598406e-05, "loss": 0.0764, "num_tokens": 40236313.0, "reward": -2.239990234375, "reward_std": 0.7605612874031067, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.08935546875, "rewards/ppl_reward/std": 7.615994930267334, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 127.84375, "completions/mean_terminated_length": 127.84375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.672555589399939, "grad_norm": 2.5152504444122314, "kl": 5.140625, "learning_rate": 1.0544613694294326e-05, "loss": 0.1927, "num_tokens": 40250951.0, "reward": -1.153076171875, "reward_std": 1.1039855480194092, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.80615234375, "rewards/ppl_reward/std": 3.731013774871826, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 126.796875, "completions/mean_terminated_length": 126.796875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.673773987206823, "grad_norm": 2.4098405838012695, "kl": 2.755859375, "learning_rate": 1.0536112332228057e-05, "loss": 0.0741, "num_tokens": 40265570.0, "reward": -2.393798828125, "reward_std": 0.918707549571991, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.62353515625, "rewards/ppl_reward/std": 6.851502418518066, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 141.140625, "completions/mean_terminated_length": 141.140625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.6749923850137067, "grad_norm": 2.230768918991089, "kl": 7.13671875, "learning_rate": 1.05276105815618e-05, "loss": 0.2991, "num_tokens": 40282371.0, "reward": -0.601806640625, "reward_std": 0.6483488082885742, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -4.68017578125, "rewards/ppl_reward/std": 2.156886100769043, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 131.765625, "completions/mean_terminated_length": 131.765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.6762107828205908, "grad_norm": 1.8178333044052124, "kl": 4.41796875, "learning_rate": 1.0519108448458034e-05, "loss": 0.1578, "num_tokens": 40297956.0, "reward": -1.431884765625, "reward_std": 0.9549664855003357, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.37939453125, "rewards/ppl_reward/std": 4.569881439208984, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.677429180627475, "grad_norm": 3.631380081176758, "kl": 4.21484375, "learning_rate": 1.0510605939079505e-05, "loss": 0.1221, "num_tokens": 40312044.0, "reward": -1.20263671875, "reward_std": 0.8459118008613586, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.1240234375, "rewards/ppl_reward/std": 3.339132785797119, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 126.546875, "completions/mean_terminated_length": 126.546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.678647578434359, "grad_norm": 2.1731464862823486, "kl": 2.962890625, "learning_rate": 1.0502103059589253e-05, "loss": 0.0895, "num_tokens": 40327319.0, "reward": -0.5521240234375, "reward_std": 0.8642618060112, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.955810546875, "rewards/ppl_reward/std": 3.9577531814575195, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 136.59375, "completions/mean_terminated_length": 136.59375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.679865976241243, "grad_norm": 1.675986409187317, "kl": 3.58984375, "learning_rate": 1.0493599816150566e-05, "loss": 0.133, "num_tokens": 40344325.0, "reward": -0.79241943359375, "reward_std": 0.46443742513656616, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.3035888671875, "rewards/ppl_reward/std": 3.3928327560424805, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 124.28125, "completions/mean_terminated_length": 124.28125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.681084374048127, "grad_norm": 2.583369255065918, "kl": 2.5146484375, "learning_rate": 1.0485096214927002e-05, "loss": 0.0725, "num_tokens": 40359231.0, "reward": -3.127197265625, "reward_std": 0.8592029213905334, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -10.06689453125, "rewards/ppl_reward/std": 5.8075761795043945, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.6823027718550105, "grad_norm": 2.16009259223938, "kl": 4.4873046875, "learning_rate": 1.0476592262082384e-05, "loss": 0.1822, "num_tokens": 40374575.0, "reward": -1.32666015625, "reward_std": 1.2232201099395752, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.3720703125, "rewards/ppl_reward/std": 3.2250587940216064, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 127.390625, "completions/mean_terminated_length": 127.390625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.6835211696618946, "grad_norm": 2.966447353363037, "kl": 7.609375, "learning_rate": 1.046808796378079e-05, "loss": 0.2463, "num_tokens": 40389384.0, "reward": -1.5233154296875, "reward_std": 1.2357544898986816, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.445068359375, "rewards/ppl_reward/std": 4.444797515869141, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.24241341650485992, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 122.796875, "completions/mean_terminated_length": 122.796875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.6847395674687786, "grad_norm": 1.6719578504562378, "kl": 2.896484375, "learning_rate": 1.0459583326186532e-05, "loss": 0.0656, "num_tokens": 40404171.0, "reward": -1.2398681640625, "reward_std": 0.37586072087287903, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.346923828125, "rewards/ppl_reward/std": 3.8216516971588135, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 127.359375, "completions/mean_terminated_length": 127.359375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.6859579652756627, "grad_norm": 1.798334002494812, "kl": 1.6494140625, "learning_rate": 1.0451078355464189e-05, "loss": 0.0156, "num_tokens": 40419514.0, "reward": -0.255126953125, "reward_std": 0.1804785579442978, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.43994140625, "rewards/ppl_reward/std": 1.8775676488876343, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 118.953125, "completions/mean_terminated_length": 118.953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.6871763630825463, "grad_norm": 2.3551669120788574, "kl": 3.79296875, "learning_rate": 1.044257305777857e-05, "loss": 0.0655, "num_tokens": 40433727.0, "reward": -0.70794677734375, "reward_std": 0.829072117805481, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.0643310546875, "rewards/ppl_reward/std": 3.2665793895721436, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 128.515625, "completions/mean_terminated_length": 128.515625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.6883947608894303, "grad_norm": 2.6374058723449707, "kl": 4.556640625, "learning_rate": 1.0434067439294726e-05, "loss": 0.1644, "num_tokens": 40448952.0, "reward": -1.8338623046875, "reward_std": 0.7358677387237549, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.386474609375, "rewards/ppl_reward/std": 6.767747402191162, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 121.4603271484375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.6896131586963143, "grad_norm": 2.8003275394439697, "kl": 9.9140625, "learning_rate": 1.0425561506177937e-05, "loss": 0.5289, "num_tokens": 40464748.0, "reward": -2.17333984375, "reward_std": 0.964942455291748, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.9873046875, "rewards/ppl_reward/std": 6.543064117431641, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 128.765625, "completions/mean_terminated_length": 128.765625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.6908315565031984, "grad_norm": 1.9328100681304932, "kl": 3.2890625, "learning_rate": 1.0417055264593712e-05, "loss": 0.0958, "num_tokens": 40479509.0, "reward": -0.93359375, "reward_std": 0.28738465905189514, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.75, "rewards/ppl_reward/std": 2.3066275119781494, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 128.046875, "completions/mean_terminated_length": 128.046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.6920499543100824, "grad_norm": 1.7305090427398682, "kl": 4.451171875, "learning_rate": 1.0408548720707778e-05, "loss": 0.1014, "num_tokens": 40493880.0, "reward": -2.9296875, "reward_std": 1.166972279548645, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.390625, "rewards/ppl_reward/std": 12.792686462402344, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 123.234375, "completions/mean_terminated_length": 123.234375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 2.693268352116966, "grad_norm": 1.942903757095337, "kl": 6.8984375, "learning_rate": 1.0400041880686096e-05, "loss": 0.2437, "num_tokens": 40508039.0, "reward": -2.42938232421875, "reward_std": 1.4411808252334595, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.4212646484375, "rewards/ppl_reward/std": 5.729386806488037, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 147.953125, "completions/mean_terminated_length": 134.04762268066406, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.69448674992385, "grad_norm": 1.9009050130844116, "kl": 8.23046875, "learning_rate": 1.039153475069483e-05, "loss": 0.5527, "num_tokens": 40524644.0, "reward": -0.7025146484375, "reward_std": 0.9077008962631226, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.139404296875, "rewards/ppl_reward/std": 3.006016492843628, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 125.234375, "completions/mean_terminated_length": 125.234375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.695705147730734, "grad_norm": 2.021836280822754, "kl": 4.541015625, "learning_rate": 1.0383027336900356e-05, "loss": 0.161, "num_tokens": 40539051.0, "reward": -1.0482177734375, "reward_std": 0.5180156230926514, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.713623046875, "rewards/ppl_reward/std": 2.657961130142212, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 153.828125, "completions/mean_terminated_length": 153.828125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.696923545537618, "grad_norm": 2.042893648147583, "kl": 8.15234375, "learning_rate": 1.0374519645469254e-05, "loss": 0.4618, "num_tokens": 40555896.0, "reward": -1.8973388671875, "reward_std": 1.0742437839508057, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.411865234375, "rewards/ppl_reward/std": 3.748270034790039, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 139.328125, "completions/mean_terminated_length": 139.328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.6981419433445017, "grad_norm": 1.998473048210144, "kl": 4.783203125, "learning_rate": 1.036601168256831e-05, "loss": 0.1549, "num_tokens": 40571821.0, "reward": -3.1026611328125, "reward_std": 1.266150951385498, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.869384765625, "rewards/ppl_reward/std": 7.579331874847412, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 137.828125, "completions/mean_terminated_length": 137.828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.699360341151386, "grad_norm": 1.8447716236114502, "kl": 5.390625, "learning_rate": 1.0357503454364504e-05, "loss": 0.2861, "num_tokens": 40587346.0, "reward": -1.800537109375, "reward_std": 0.668520987033844, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.23388671875, "rewards/ppl_reward/std": 3.963744640350342, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 131.109375, "completions/mean_terminated_length": 131.109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.70057873895827, "grad_norm": 1.906104326248169, "kl": 3.986328125, "learning_rate": 1.0348994967025012e-05, "loss": 0.121, "num_tokens": 40602129.0, "reward": -3.1944580078125, "reward_std": 1.1121292114257812, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.092041015625, "rewards/ppl_reward/std": 8.942378997802734, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 123.296875, "completions/mean_terminated_length": 123.296875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.701797136765154, "grad_norm": 1.5077930688858032, "kl": 3.5, "learning_rate": 1.0340486226717188e-05, "loss": 0.0391, "num_tokens": 40616340.0, "reward": -3.07958984375, "reward_std": 1.1217286586761475, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.8935546875, "rewards/ppl_reward/std": 10.348403930664062, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 140.953125, "completions/mean_terminated_length": 140.953125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.703015534572038, "grad_norm": 2.239443063735962, "kl": 8.4765625, "learning_rate": 1.0331977239608584e-05, "loss": 0.3993, "num_tokens": 40631801.0, "reward": -0.9412841796875, "reward_std": 1.0708786249160767, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.406005859375, "rewards/ppl_reward/std": 2.9080915451049805, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 127.609375, "completions/mean_terminated_length": 127.609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.704233932378922, "grad_norm": 1.6363427639007568, "kl": 2.30859375, "learning_rate": 1.0323468011866912e-05, "loss": 0.0215, "num_tokens": 40645792.0, "reward": -2.3594970703125, "reward_std": 0.5304503440856934, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.437744140625, "rewards/ppl_reward/std": 4.368327617645264, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 139.46875, "completions/mean_terminated_length": 139.46875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.7054523301858056, "grad_norm": 1.7460006475448608, "kl": 5.271484375, "learning_rate": 1.0314958549660083e-05, "loss": 0.1461, "num_tokens": 40661990.0, "reward": -0.665771484375, "reward_std": 0.6979612112045288, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.95654296875, "rewards/ppl_reward/std": 2.5801279544830322, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 157.09375, "completions/mean_terminated_length": 157.09375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.7066707279926896, "grad_norm": 1.4255329370498657, "kl": 3.048828125, "learning_rate": 1.0306448859156155e-05, "loss": 0.0816, "num_tokens": 40679820.0, "reward": -1.803466796875, "reward_std": 0.4398045539855957, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.42724609375, "rewards/ppl_reward/std": 3.4046218395233154, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1269075721502304, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.7078891257995736, "grad_norm": 2.2645151615142822, "kl": 6.546875, "learning_rate": 1.0297938946523361e-05, "loss": 0.2474, "num_tokens": 40695300.0, "reward": -0.8843994140625, "reward_std": 1.297924280166626, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.268798828125, "rewards/ppl_reward/std": 4.15492057800293, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 131.828125, "completions/mean_terminated_length": 131.828125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.7091075236064572, "grad_norm": 3.1995134353637695, "kl": 5.9609375, "learning_rate": 1.0289428817930096e-05, "loss": 0.1992, "num_tokens": 40710041.0, "reward": -1.1298828125, "reward_std": 0.8793550729751587, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.775390625, "rewards/ppl_reward/std": 2.2597334384918213, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 148.3125, "completions/mean_terminated_length": 148.3125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.7103259214133413, "grad_norm": 1.8661874532699585, "kl": 5.27734375, "learning_rate": 1.0280918479544914e-05, "loss": 0.2163, "num_tokens": 40726573.0, "reward": -1.332275390625, "reward_std": 0.6203274726867676, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.28955078125, "rewards/ppl_reward/std": 3.572110891342163, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 147.703125, "completions/mean_terminated_length": 147.703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.7115443192202253, "grad_norm": 4.250191688537598, "kl": 7.640625, "learning_rate": 1.0272407937536514e-05, "loss": 0.2032, "num_tokens": 40742858.0, "reward": -1.4820556640625, "reward_std": 1.7773613929748535, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.378173828125, "rewards/ppl_reward/std": 4.016290664672852, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.270231693983078, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 153.921875, "completions/mean_terminated_length": 153.921875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.7127627170271094, "grad_norm": 2.2223432064056396, "kl": 2.33984375, "learning_rate": 1.0263897198073744e-05, "loss": 0.0206, "num_tokens": 40759581.0, "reward": -0.32940673828125, "reward_std": 0.5004357099533081, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.4556884765625, "rewards/ppl_reward/std": 1.4768129587173462, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 169.546875, "completions/mean_terminated_length": 169.546875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.7139811148339934, "grad_norm": 6.489713668823242, "kl": 15.0703125, "learning_rate": 1.0255386267325602e-05, "loss": 0.7251, "num_tokens": 40778016.0, "reward": -1.326904296875, "reward_std": 1.0178245306015015, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4732423722743988, "rewards/ppl_reward/mean": -5.73193359375, "rewards/ppl_reward/std": 3.376368761062622, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.26714521646499634, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.7151995126408774, "grad_norm": 2.1091935634613037, "kl": 7.2109375, "learning_rate": 1.0246875151461212e-05, "loss": 0.4552, "num_tokens": 40794160.0, "reward": -2.180908203125, "reward_std": 1.385392189025879, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.02587890625, "rewards/ppl_reward/std": 5.286867618560791, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 135.4375, "completions/mean_terminated_length": 135.4375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.716417910447761, "grad_norm": 2.250917434692383, "kl": 4.154296875, "learning_rate": 1.0238363856649852e-05, "loss": 0.1102, "num_tokens": 40809572.0, "reward": -0.322265625, "reward_std": 0.7174479961395264, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.32421875, "rewards/ppl_reward/std": 1.6084851026535034, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 133.78125, "completions/mean_terminated_length": 133.78125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.717636308254645, "grad_norm": 1.800102949142456, "kl": 3.162109375, "learning_rate": 1.0229852389060905e-05, "loss": -0.0237, "num_tokens": 40824590.0, "reward": -1.5091552734375, "reward_std": 0.6779564619064331, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.705810546875, "rewards/ppl_reward/std": 5.336026668548584, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 141.390625, "completions/mean_terminated_length": 141.390625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.718854706061529, "grad_norm": 1.621340274810791, "kl": 4.640625, "learning_rate": 1.02213407548639e-05, "loss": 0.0705, "num_tokens": 40840439.0, "reward": -4.6705322265625, "reward_std": 3.253354787826538, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -12.747314453125, "rewards/ppl_reward/std": 22.618640899658203, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.26679685711860657, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 149.484375, "completions/mean_terminated_length": 149.484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.720073103868413, "grad_norm": 1.6747206449508667, "kl": 5.353515625, "learning_rate": 1.0212828960228475e-05, "loss": 0.2259, "num_tokens": 40857102.0, "reward": -0.62353515625, "reward_std": 0.5893455743789673, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.9814453125, "rewards/ppl_reward/std": 1.881144642829895, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 171.53125, "completions/mean_terminated_length": 158.00001525878906, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.7212915016752968, "grad_norm": 1.8009693622589111, "kl": 6.94140625, "learning_rate": 1.0204317011324391e-05, "loss": 0.3985, "num_tokens": 40875872.0, "reward": -0.8497314453125, "reward_std": 0.5329646468162537, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.418212890625, "rewards/ppl_reward/std": 1.6340364217758179, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 153.453125, "completions/mean_terminated_length": 153.453125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.722509899482181, "grad_norm": 2.116434097290039, "kl": 5.853515625, "learning_rate": 1.0195804914321515e-05, "loss": 0.3965, "num_tokens": 40892429.0, "reward": -1.06591796875, "reward_std": 0.7430675029754639, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.8740234375, "rewards/ppl_reward/std": 3.914731025695801, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 152.234375, "completions/mean_terminated_length": 152.234375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.723728297289065, "grad_norm": 1.8788326978683472, "kl": 5.3046875, "learning_rate": 1.0187292675389821e-05, "loss": 0.2355, "num_tokens": 40909140.0, "reward": -1.338134765625, "reward_std": 0.8581675291061401, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.27783203125, "rewards/ppl_reward/std": 2.94893217086792, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 159.734375, "completions/mean_terminated_length": 159.734375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.724946695095949, "grad_norm": 1.7338507175445557, "kl": 6.1484375, "learning_rate": 1.0178780300699395e-05, "loss": 0.3199, "num_tokens": 40926307.0, "reward": -1.4500732421875, "reward_std": 0.9344358444213867, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.486083984375, "rewards/ppl_reward/std": 4.877993106842041, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.726165092902833, "grad_norm": 2.2598466873168945, "kl": 1.7705078125, "learning_rate": 1.0170267796420416e-05, "loss": -0.0321, "num_tokens": 40942007.0, "reward": -2.484619140625, "reward_std": 0.44848597049713135, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.85986328125, "rewards/ppl_reward/std": 8.139641761779785, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 157.765625, "completions/mean_terminated_length": 157.765625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.727383490709717, "grad_norm": 1.6609842777252197, "kl": 5.984375, "learning_rate": 1.0161755168723153e-05, "loss": 0.2924, "num_tokens": 40959064.0, "reward": -1.671142578125, "reward_std": 2.5032193660736084, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.92822265625, "rewards/ppl_reward/std": 8.32214641571045, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 2.7286018885166006, "grad_norm": 1.951500654220581, "kl": 3.2685546875, "learning_rate": 1.0153242423777964e-05, "loss": 0.0924, "num_tokens": 40975472.0, "reward": -0.95068359375, "reward_std": 0.7847944498062134, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.6123046875, "rewards/ppl_reward/std": 2.975287437438965, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 164.109375, "completions/mean_terminated_length": 164.109375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.7298202863234846, "grad_norm": 1.3610130548477173, "kl": 3.8828125, "learning_rate": 1.0144729567755302e-05, "loss": 0.1054, "num_tokens": 40993447.0, "reward": -0.0841064453125, "reward_std": 0.443427711725235, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.840087890625, "rewards/ppl_reward/std": 1.9137296676635742, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 155.46875, "completions/mean_terminated_length": 155.46875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.7310386841303687, "grad_norm": 1.5976920127868652, "kl": 2.142578125, "learning_rate": 1.0136216606825693e-05, "loss": 0.0441, "num_tokens": 41009845.0, "reward": -0.902587890625, "reward_std": 0.24625781178474426, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.71923828125, "rewards/ppl_reward/std": 2.319676160812378, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 190.203125, "completions/mean_terminated_length": 190.203125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.7322570819372523, "grad_norm": 1.1508121490478516, "kl": 2.01171875, "learning_rate": 1.012770354715974e-05, "loss": -0.0055, "num_tokens": 41029018.0, "reward": -1.149658203125, "reward_std": 0.520170271396637, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.16650390625, "rewards/ppl_reward/std": 4.5783514976501465, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 160.421875, "completions/mean_terminated_length": 160.421875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.7334754797441363, "grad_norm": 1.7150682210922241, "kl": 4.7890625, "learning_rate": 1.0119190394928118e-05, "loss": 0.1056, "num_tokens": 41046421.0, "reward": -1.5472412109375, "reward_std": 1.1325820684432983, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.578857421875, "rewards/ppl_reward/std": 3.147862195968628, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 149.265625, "completions/mean_terminated_length": 149.265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.7346938775510203, "grad_norm": 2.300279378890991, "kl": 3.16015625, "learning_rate": 1.0110677156301565e-05, "loss": 0.0437, "num_tokens": 41062662.0, "reward": -0.709228515625, "reward_std": 0.6952553987503052, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.17626953125, "rewards/ppl_reward/std": 2.7109084129333496, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 161.234375, "completions/mean_terminated_length": 161.234375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 2.7359122753579044, "grad_norm": 1.605504035949707, "kl": 3.783203125, "learning_rate": 1.0102163837450889e-05, "loss": 0.1306, "num_tokens": 41079797.0, "reward": -0.5736083984375, "reward_std": 0.40324652194976807, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.990966796875, "rewards/ppl_reward/std": 1.8006588220596313, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 176.609375, "completions/mean_terminated_length": 176.609375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.7371306731647884, "grad_norm": 1.3720487356185913, "kl": 2.4072265625, "learning_rate": 1.0093650444546954e-05, "loss": 0.015, "num_tokens": 41098012.0, "reward": -0.1685791015625, "reward_std": 0.3383212685585022, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.149658203125, "rewards/ppl_reward/std": 1.0205440521240234, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 157.515625, "completions/mean_terminated_length": 157.515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.7383490709716725, "grad_norm": 2.1083085536956787, "kl": 3.392578125, "learning_rate": 1.0085136983760676e-05, "loss": -0.0178, "num_tokens": 41115125.0, "reward": -0.8504638671875, "reward_std": 0.9651513695716858, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.364990234375, "rewards/ppl_reward/std": 2.135796546936035, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.739567468778556, "grad_norm": 1.3909862041473389, "kl": 2.482421875, "learning_rate": 1.0076623461263017e-05, "loss": 0.0189, "num_tokens": 41133261.0, "reward": -0.986572265625, "reward_std": 0.5392715930938721, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.76220703125, "rewards/ppl_reward/std": 3.467292308807373, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 159.9375, "completions/mean_terminated_length": 159.9375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.74078586658544, "grad_norm": 3.0249814987182617, "kl": 4.23046875, "learning_rate": 1.0068109883224992e-05, "loss": 0.0166, "num_tokens": 41150001.0, "reward": -2.9715576171875, "reward_std": 0.7646830081939697, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.708740234375, "rewards/ppl_reward/std": 8.085436820983887, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 151.078125, "completions/mean_terminated_length": 151.078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.742004264392324, "grad_norm": 1.4520106315612793, "kl": 4.1953125, "learning_rate": 1.005959625581765e-05, "loss": 0.0534, "num_tokens": 41166302.0, "reward": -1.0390625, "reward_std": 0.5854737758636475, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.859375, "rewards/ppl_reward/std": 2.8429391384124756, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 170.578125, "completions/mean_terminated_length": 170.578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.743222662199208, "grad_norm": 1.3314074277877808, "kl": 2.865234375, "learning_rate": 1.0051082585212075e-05, "loss": -0.0051, "num_tokens": 41184587.0, "reward": -2.1256103515625, "reward_std": 1.5923956632614136, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.048095703125, "rewards/ppl_reward/std": 8.659934043884277, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 179.328125, "completions/mean_terminated_length": 179.328125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.744441060006092, "grad_norm": 1.7925535440444946, "kl": 6.353515625, "learning_rate": 1.0042568877579388e-05, "loss": 0.222, "num_tokens": 41203056.0, "reward": -0.7791748046875, "reward_std": 0.7118465900421143, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.097412109375, "rewards/ppl_reward/std": 2.9922738075256348, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 185.484375, "completions/mean_terminated_length": 185.484375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.745659457812976, "grad_norm": 1.9405955076217651, "kl": 6.39453125, "learning_rate": 1.0034055139090733e-05, "loss": 0.2865, "num_tokens": 41221799.0, "reward": -1.994384765625, "reward_std": 1.292538046836853, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.61376953125, "rewards/ppl_reward/std": 7.554040908813477, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 166.5625, "completions/mean_terminated_length": 166.5625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.74687785561986, "grad_norm": 1.6995823383331299, "kl": 4.123046875, "learning_rate": 1.0025541375917274e-05, "loss": 0.0492, "num_tokens": 41239555.0, "reward": -2.5318603515625, "reward_std": 2.0173559188842773, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.634033203125, "rewards/ppl_reward/std": 7.887838363647461, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 180.421875, "completions/mean_terminated_length": 180.421875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.748096253426744, "grad_norm": 1.5125457048416138, "kl": 3.34375, "learning_rate": 1.00170275942302e-05, "loss": 0.0388, "num_tokens": 41258310.0, "reward": -0.6663818359375, "reward_std": 0.8411835432052612, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.989013671875, "rewards/ppl_reward/std": 2.712406873703003, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 168.82540893554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.749314651233628, "grad_norm": 1.5158770084381104, "kl": 8.796875, "learning_rate": 1.0008513800200707e-05, "loss": 0.4461, "num_tokens": 41276658.0, "reward": -0.4129638671875, "reward_std": 1.0613988637924194, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.302490234375, "rewards/ppl_reward/std": 2.636638879776001, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1925172060728073, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 178.59375, "completions/mean_terminated_length": 178.59375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 2.750533049040512, "grad_norm": 1.7239328622817993, "kl": 4.94921875, "learning_rate": 1e-05, "loss": 0.1199, "num_tokens": 41294904.0, "reward": -2.3206787109375, "reward_std": 1.5552513599395752, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.227294921875, "rewards/ppl_reward/std": 8.99393367767334, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21474508941173553, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 192.234375, "completions/mean_terminated_length": 192.234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.7517514468473956, "grad_norm": 1.1715975999832153, "kl": 2.193359375, "learning_rate": 9.991486199799295e-06, "loss": 0.0743, "num_tokens": 41314215.0, "reward": -1.95166015625, "reward_std": 0.4271641969680786, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.7392578125, "rewards/ppl_reward/std": 4.200870513916016, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 186.640625, "completions/mean_terminated_length": 186.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.7529698446542796, "grad_norm": 1.7611528635025024, "kl": 2.677734375, "learning_rate": 9.982972405769803e-06, "loss": -0.0132, "num_tokens": 41333256.0, "reward": -0.778076171875, "reward_std": 0.3506307303905487, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.35302734375, "rewards/ppl_reward/std": 1.8828285932540894, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 188.8125, "completions/mean_terminated_length": 188.8125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.7541882424611637, "grad_norm": 1.3226374387741089, "kl": 1.7236328125, "learning_rate": 9.974458624082726e-06, "loss": 0.0512, "num_tokens": 41352348.0, "reward": -0.05230712890625, "reward_std": 0.2159881442785263, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -3.9483642578125, "rewards/ppl_reward/std": 1.7550647258758545, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 172.765625, "completions/mean_terminated_length": 172.765625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.7554066402680473, "grad_norm": 1.331827998161316, "kl": 0.94921875, "learning_rate": 9.965944860909267e-06, "loss": 0.0148, "num_tokens": 41369989.0, "reward": -1.9530029296875, "reward_std": 0.4719015061855316, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.859130859375, "rewards/ppl_reward/std": 7.688488483428955, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 177.703125, "completions/mean_terminated_length": 177.703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.7566250380749313, "grad_norm": 2.769705295562744, "kl": 3.060546875, "learning_rate": 9.957431122420615e-06, "loss": 0.1487, "num_tokens": 41388170.0, "reward": -1.4447021484375, "reward_std": 1.0321974754333496, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.725341796875, "rewards/ppl_reward/std": 7.257603168487549, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.7578434358818154, "grad_norm": 1.412352442741394, "kl": 1.97265625, "learning_rate": 9.948917414787928e-06, "loss": -0.0356, "num_tokens": 41406446.0, "reward": -1.37353515625, "reward_std": 0.4692175090312958, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.4970703125, "rewards/ppl_reward/std": 2.715620756149292, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 177.359375, "completions/mean_terminated_length": 177.359375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.7590618336886994, "grad_norm": 1.4295190572738647, "kl": 2.28125, "learning_rate": 9.940403744182355e-06, "loss": 0.0331, "num_tokens": 41424917.0, "reward": -0.692626953125, "reward_std": 0.3695480227470398, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.21337890625, "rewards/ppl_reward/std": 2.848447799682617, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 180.46875, "completions/mean_terminated_length": 180.46875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.7602802314955834, "grad_norm": 2.4572877883911133, "kl": 2.818359375, "learning_rate": 9.93189011677501e-06, "loss": 0.0959, "num_tokens": 41443467.0, "reward": -0.695556640625, "reward_std": 0.5985938310623169, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.03173828125, "rewards/ppl_reward/std": 1.8682043552398682, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.7614986293024675, "grad_norm": 1.4392138719558716, "kl": 2.5439453125, "learning_rate": 9.923376538736985e-06, "loss": 0.0462, "num_tokens": 41460679.0, "reward": -1.2379150390625, "reward_std": 0.5888368487358093, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.241455078125, "rewards/ppl_reward/std": 3.1412389278411865, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 171.734375, "completions/mean_terminated_length": 171.734375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.762717027109351, "grad_norm": 2.2878777980804443, "kl": 5.20703125, "learning_rate": 9.914863016239327e-06, "loss": 0.2802, "num_tokens": 41478838.0, "reward": -1.20458984375, "reward_std": 1.3820774555206299, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.0419921875, "rewards/ppl_reward/std": 6.033958435058594, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 178.015625, "completions/mean_terminated_length": 178.015625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.763935424916235, "grad_norm": 1.3880326747894287, "kl": 2.3359375, "learning_rate": 9.906349555453049e-06, "loss": -0.0023, "num_tokens": 41497439.0, "reward": -0.2904052734375, "reward_std": 0.5014967918395996, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.322998046875, "rewards/ppl_reward/std": 2.3258512020111084, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 161.796875, "completions/mean_terminated_length": 161.796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.765153822723119, "grad_norm": 1.3434867858886719, "kl": 1.8837890625, "learning_rate": 9.897836162549113e-06, "loss": 0.0158, "num_tokens": 41514250.0, "reward": -2.2352294921875, "reward_std": 0.41265350580215454, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.314208984375, "rewards/ppl_reward/std": 5.3264336585998535, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.766372220530003, "grad_norm": 2.3257198333740234, "kl": 7.37109375, "learning_rate": 9.889322843698437e-06, "loss": 0.2748, "num_tokens": 41533238.0, "reward": -1.0035400390625, "reward_std": 0.8980644941329956, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.452392578125, "rewards/ppl_reward/std": 3.917853355407715, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 184.109375, "completions/mean_terminated_length": 184.109375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.767590618336887, "grad_norm": 1.4711604118347168, "kl": 6.3203125, "learning_rate": 9.880809605071889e-06, "loss": 0.3074, "num_tokens": 41552317.0, "reward": -2.1856689453125, "reward_std": 1.893555760383606, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.011962890625, "rewards/ppl_reward/std": 9.550517082214355, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 161.671875, "completions/mean_terminated_length": 161.671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.768809016143771, "grad_norm": 1.1876585483551025, "kl": 2.8916015625, "learning_rate": 9.872296452840266e-06, "loss": -0.0043, "num_tokens": 41569272.0, "reward": -0.48114013671875, "reward_std": 0.47203436493873596, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.6497802734375, "rewards/ppl_reward/std": 3.079859733581543, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 159.703125, "completions/mean_terminated_length": 159.703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.770027413950655, "grad_norm": 5.470752239227295, "kl": 10.8359375, "learning_rate": 9.86378339317431e-06, "loss": 0.4024, "num_tokens": 41586181.0, "reward": -2.33544921875, "reward_std": 2.147630214691162, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -7.8818359375, "rewards/ppl_reward/std": 8.004798889160156, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2395833432674408, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 193.078125, "completions/mean_terminated_length": 179.88890075683594, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.771245811757539, "grad_norm": 1.824271321296692, "kl": 8.3427734375, "learning_rate": 9.855270432244699e-06, "loss": 0.6144, "num_tokens": 41605338.0, "reward": -2.4039306640625, "reward_std": 0.5506933927536011, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.620361328125, "rewards/ppl_reward/std": 6.23746919631958, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 151.546875, "completions/mean_terminated_length": 151.546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.772464209564423, "grad_norm": 2.0250420570373535, "kl": 3.6669921875, "learning_rate": 9.846757576222038e-06, "loss": 0.0924, "num_tokens": 41622037.0, "reward": -3.7359619140625, "reward_std": 0.8543626666069031, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -11.237548828125, "rewards/ppl_reward/std": 11.249728202819824, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 150.984375, "completions/mean_terminated_length": 150.984375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.773682607371307, "grad_norm": 2.6278984546661377, "kl": 5.27734375, "learning_rate": 9.83824483127685e-06, "loss": 0.0551, "num_tokens": 41638068.0, "reward": -2.68896484375, "reward_std": 2.393831491470337, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.7294921875, "rewards/ppl_reward/std": 11.722334861755371, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2816905975341797, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 164.234375, "completions/mean_terminated_length": 164.234375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.7749010051781906, "grad_norm": 1.6442112922668457, "kl": 5.783203125, "learning_rate": 9.829732203579586e-06, "loss": 0.2342, "num_tokens": 41655563.0, "reward": -0.80224609375, "reward_std": 0.8338406682014465, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.1591796875, "rewards/ppl_reward/std": 2.2900290489196777, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22736713290214539, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 155.84375, "completions/mean_terminated_length": 155.84375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.7761194029850746, "grad_norm": 2.706016778945923, "kl": 5.21484375, "learning_rate": 9.821219699300605e-06, "loss": 0.1307, "num_tokens": 41672769.0, "reward": -1.4755859375, "reward_std": 0.8495625257492065, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.482421875, "rewards/ppl_reward/std": 4.076318264007568, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14773420989513397, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 157.28125, "completions/mean_terminated_length": 157.28125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.7773378007919587, "grad_norm": 1.6093329191207886, "kl": 5.765625, "learning_rate": 9.812707324610177e-06, "loss": 0.1738, "num_tokens": 41690003.0, "reward": -7.7569580078125, "reward_std": 1.770334243774414, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -19.045166015625, "rewards/ppl_reward/std": 29.459138870239258, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 154.953125, "completions/mean_terminated_length": 154.953125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.7785561985988423, "grad_norm": 2.0224711894989014, "kl": 5.7890625, "learning_rate": 9.80419508567849e-06, "loss": 0.1435, "num_tokens": 41707352.0, "reward": -1.61773681640625, "reward_std": 0.6448346376419067, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.8214111328125, "rewards/ppl_reward/std": 4.79619026184082, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 156.265625, "completions/mean_terminated_length": 156.265625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.7797745964057263, "grad_norm": 2.4416778087615967, "kl": 4.20703125, "learning_rate": 9.795682988675612e-06, "loss": 0.0687, "num_tokens": 41724641.0, "reward": -1.7178955078125, "reward_std": 1.2907065153121948, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.959228515625, "rewards/ppl_reward/std": 4.784212112426758, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 170.484375, "completions/mean_terminated_length": 170.484375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.7809929942126104, "grad_norm": 3.06247615814209, "kl": 3.1845703125, "learning_rate": 9.787171039771528e-06, "loss": 0.1, "num_tokens": 41743552.0, "reward": -0.36932373046875, "reward_std": 0.5559803247451782, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.4573974609375, "rewards/ppl_reward/std": 3.0118908882141113, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.7822113920194944, "grad_norm": 1.5146610736846924, "kl": 2.837890625, "learning_rate": 9.778659245136102e-06, "loss": 0.035, "num_tokens": 41759784.0, "reward": -1.1376953125, "reward_std": 0.567123293876648, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.908203125, "rewards/ppl_reward/std": 3.9545624256134033, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 155.453125, "completions/mean_terminated_length": 155.453125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.7834297898263785, "grad_norm": 2.4511539936065674, "kl": 3.212890625, "learning_rate": 9.770147610939098e-06, "loss": -0.0126, "num_tokens": 41776885.0, "reward": -4.32666015625, "reward_std": 0.8415392637252808, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -12.4267578125, "rewards/ppl_reward/std": 12.84316349029541, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 162.140625, "completions/mean_terminated_length": 162.140625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.7846481876332625, "grad_norm": 2.1300973892211914, "kl": 4.5859375, "learning_rate": 9.761636143350152e-06, "loss": 0.101, "num_tokens": 41794758.0, "reward": -2.2479248046875, "reward_std": 1.3838095664978027, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.948974609375, "rewards/ppl_reward/std": 4.0599751472473145, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 155.71875, "completions/mean_terminated_length": 155.71875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.785866585440146, "grad_norm": 1.8659775257110596, "kl": 3.75390625, "learning_rate": 9.753124848538786e-06, "loss": 0.1223, "num_tokens": 41811588.0, "reward": -1.510498046875, "reward_std": 1.2415488958358765, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.50537109375, "rewards/ppl_reward/std": 5.168676853179932, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 150.015625, "completions/mean_terminated_length": 150.015625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.78708498324703, "grad_norm": 1.4380717277526855, "kl": 2.953125, "learning_rate": 9.744613732674401e-06, "loss": -0.0025, "num_tokens": 41828317.0, "reward": -0.71307373046875, "reward_std": 0.9366261959075928, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.1605224609375, "rewards/ppl_reward/std": 4.0252366065979, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 170.515625, "completions/mean_terminated_length": 170.515625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.788303381053914, "grad_norm": 3.079429864883423, "kl": 5.828125, "learning_rate": 9.736102801926261e-06, "loss": 0.2935, "num_tokens": 41846166.0, "reward": -1.6903076171875, "reward_std": 0.8748419284820557, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.911865234375, "rewards/ppl_reward/std": 3.253798484802246, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.116794154047966, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.7895217788607978, "grad_norm": 1.2419615983963013, "kl": 2.6884765625, "learning_rate": 9.72759206246349e-06, "loss": 0.0158, "num_tokens": 41862682.0, "reward": -1.309814453125, "reward_std": 0.7382804155349731, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.26806640625, "rewards/ppl_reward/std": 3.2432682514190674, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 154.1875, "completions/mean_terminated_length": 154.1875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.790740176667682, "grad_norm": 1.31978440284729, "kl": 3.96875, "learning_rate": 9.71908152045509e-06, "loss": 0.1695, "num_tokens": 41879622.0, "reward": -0.7330322265625, "reward_std": 0.7299603223800659, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.114501953125, "rewards/ppl_reward/std": 3.3557918071746826, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 157.359375, "completions/mean_terminated_length": 157.359375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.791958574474566, "grad_norm": 1.878118872642517, "kl": 3.70703125, "learning_rate": 9.710571182069907e-06, "loss": 0.1476, "num_tokens": 41896941.0, "reward": -1.9000244140625, "reward_std": 1.1253036260604858, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.503173828125, "rewards/ppl_reward/std": 5.5785369873046875, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 140.28125, "completions/mean_terminated_length": 140.28125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.79317697228145, "grad_norm": 2.2223892211914062, "kl": 5.134765625, "learning_rate": 9.702061053476642e-06, "loss": 0.1201, "num_tokens": 41912527.0, "reward": -2.71484375, "reward_std": 2.64253830909729, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.9453125, "rewards/ppl_reward/std": 10.24585247039795, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21921011805534363, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 139.203125, "completions/mean_terminated_length": 139.203125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.794395370088334, "grad_norm": 2.6337242126464844, "kl": 6.4765625, "learning_rate": 9.693551140843848e-06, "loss": 0.2175, "num_tokens": 41928396.0, "reward": -1.04541015625, "reward_std": 1.0287902355194092, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -5.5283203125, "rewards/ppl_reward/std": 5.163981914520264, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1666666716337204, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.795613767895218, "grad_norm": 1.748948335647583, "kl": 3.2265625, "learning_rate": 9.68504145033992e-06, "loss": 0.1645, "num_tokens": 41942440.0, "reward": -2.75714111328125, "reward_std": 3.100196123123169, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.1549072265625, "rewards/ppl_reward/std": 12.130396842956543, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 145.484375, "completions/mean_terminated_length": 145.484375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.7968321657021016, "grad_norm": 3.485121488571167, "kl": 7.6953125, "learning_rate": 9.676531988133086e-06, "loss": 0.2875, "num_tokens": 41959047.0, "reward": -2.44287109375, "reward_std": 0.9295220971107483, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -8.1826171875, "rewards/ppl_reward/std": 3.739157199859619, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.22146137058734894, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 159.8125, "completions/mean_terminated_length": 159.8125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.7980505635089856, "grad_norm": 1.8088098764419556, "kl": 5.1328125, "learning_rate": 9.668022760391417e-06, "loss": 0.2845, "num_tokens": 41976995.0, "reward": -1.0120849609375, "reward_std": 0.4701453745365143, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.805419921875, "rewards/ppl_reward/std": 2.3832011222839355, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 137.171875, "completions/mean_terminated_length": 137.171875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.7992689613158697, "grad_norm": 1.4289650917053223, "kl": 1.6220703125, "learning_rate": 9.659513773282815e-06, "loss": -0.0185, "num_tokens": 41992294.0, "reward": -0.6318359375, "reward_std": 0.6530433297157288, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.091796875, "rewards/ppl_reward/std": 2.7677414417266846, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.8004873591227537, "grad_norm": 2.2013661861419678, "kl": 6.65625, "learning_rate": 9.651005032974994e-06, "loss": 0.2443, "num_tokens": 42007630.0, "reward": -1.34716796875, "reward_std": 1.2267495393753052, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.1630859375, "rewards/ppl_reward/std": 3.590092420578003, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.233588308095932, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 148.359375, "completions/mean_terminated_length": 148.359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.8017057569296373, "grad_norm": 1.9605827331542969, "kl": 3.3349609375, "learning_rate": 9.642496545635498e-06, "loss": 0.1151, "num_tokens": 42024837.0, "reward": -2.315185546875, "reward_std": 0.46886953711509705, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.39599609375, "rewards/ppl_reward/std": 10.99963665008545, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 149.078125, "completions/mean_terminated_length": 149.078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.8029241547365213, "grad_norm": 2.269737958908081, "kl": 4.6796875, "learning_rate": 9.633988317431694e-06, "loss": 0.1395, "num_tokens": 42041882.0, "reward": -0.2535400390625, "reward_std": 0.8786778450012207, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -3.999267578125, "rewards/ppl_reward/std": 1.6385235786437988, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.26954248547554016, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.8041425525434054, "grad_norm": 1.7372848987579346, "kl": 3.60546875, "learning_rate": 9.62548035453075e-06, "loss": 0.03, "num_tokens": 42056184.0, "reward": -0.5430908203125, "reward_std": 0.5531848669052124, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.867431640625, "rewards/ppl_reward/std": 2.5031538009643555, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.8053609503502894, "grad_norm": 1.5225504636764526, "kl": 4.076171875, "learning_rate": 9.616972663099648e-06, "loss": 0.0998, "num_tokens": 42070920.0, "reward": -0.8115234375, "reward_std": 0.5812044143676758, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.294921875, "rewards/ppl_reward/std": 2.027311325073242, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 126.765625, "completions/mean_terminated_length": 126.765625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.8065793481571735, "grad_norm": 2.1334927082061768, "kl": 3.4296875, "learning_rate": 9.608465249305172e-06, "loss": 0.0385, "num_tokens": 42086305.0, "reward": -1.6551513671875, "reward_std": 0.9195005893707275, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.021240234375, "rewards/ppl_reward/std": 4.704753875732422, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 130.203125, "completions/mean_terminated_length": 130.203125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.8077977459640575, "grad_norm": 2.6251490116119385, "kl": 3.3056640625, "learning_rate": 9.599958119313904e-06, "loss": 0.0559, "num_tokens": 42101598.0, "reward": -0.8184814453125, "reward_std": 0.42783403396606445, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.418212890625, "rewards/ppl_reward/std": 2.675459861755371, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 135.765625, "completions/mean_terminated_length": 135.765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.809016143770941, "grad_norm": 1.7893301248550415, "kl": 2.740234375, "learning_rate": 9.591451279292222e-06, "loss": 0.0499, "num_tokens": 42117975.0, "reward": -0.845703125, "reward_std": 0.8752588033676147, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.36328125, "rewards/ppl_reward/std": 3.7102348804473877, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2376670390367508, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 130.046875, "completions/mean_terminated_length": 130.046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.810234541577825, "grad_norm": 2.2521214485168457, "kl": 6.15234375, "learning_rate": 9.582944735406295e-06, "loss": 0.224, "num_tokens": 42133442.0, "reward": -0.71441650390625, "reward_std": 0.5615939497947693, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.9366455078125, "rewards/ppl_reward/std": 2.739473819732666, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13903142511844635, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 134.453125, "completions/mean_terminated_length": 134.453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.811452939384709, "grad_norm": 1.5340991020202637, "kl": 2.92578125, "learning_rate": 9.574438493822068e-06, "loss": -0.0116, "num_tokens": 42149319.0, "reward": -1.51055908203125, "reward_std": 0.9866740703582764, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.7789306640625, "rewards/ppl_reward/std": 4.579054355621338, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 133.015625, "completions/mean_terminated_length": 133.015625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.812671337191593, "grad_norm": 1.9039772748947144, "kl": 3.09375, "learning_rate": 9.565932560705277e-06, "loss": 0.0875, "num_tokens": 42164800.0, "reward": -1.239501953125, "reward_std": 0.4676958918571472, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.29931640625, "rewards/ppl_reward/std": 1.474914312362671, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 129.359375, "completions/mean_terminated_length": 129.359375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.813889734998477, "grad_norm": 1.7814215421676636, "kl": 2.5361328125, "learning_rate": 9.557426942221431e-06, "loss": 0.0247, "num_tokens": 42180119.0, "reward": -2.3238525390625, "reward_std": 0.47890573740005493, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.507080078125, "rewards/ppl_reward/std": 8.727935791015625, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 140.234375, "completions/mean_terminated_length": 140.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.815108132805361, "grad_norm": 1.5550318956375122, "kl": 4.81640625, "learning_rate": 9.548921644535815e-06, "loss": 0.2537, "num_tokens": 42196118.0, "reward": -5.7398681640625, "reward_std": 3.0312085151672363, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -15.135986328125, "rewards/ppl_reward/std": 25.154634475708008, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.816326530612245, "grad_norm": 1.3802053928375244, "kl": 1.564453125, "learning_rate": 9.540416673813471e-06, "loss": -0.0844, "num_tokens": 42210386.0, "reward": -1.47705078125, "reward_std": 0.5867030620574951, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.7587890625, "rewards/ppl_reward/std": 3.5943493843078613, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 132.734375, "completions/mean_terminated_length": 132.734375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.817544928419129, "grad_norm": 2.2119994163513184, "kl": 2.380859375, "learning_rate": 9.531912036219214e-06, "loss": 0.0056, "num_tokens": 42225425.0, "reward": -2.959716796875, "reward_std": 1.5147590637207031, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.65380859375, "rewards/ppl_reward/std": 8.540157318115234, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.818763326226013, "grad_norm": 2.0168795585632324, "kl": 3.349609375, "learning_rate": 9.523407737917616e-06, "loss": 0.1033, "num_tokens": 42240573.0, "reward": -1.475830078125, "reward_std": 0.7931597232818604, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.49853515625, "rewards/ppl_reward/std": 5.034292221069336, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 141.359375, "completions/mean_terminated_length": 141.359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.8199817240328966, "grad_norm": 1.3299449682235718, "kl": 2.4267578125, "learning_rate": 9.514903785072998e-06, "loss": -0.0151, "num_tokens": 42257108.0, "reward": -0.8470458984375, "reward_std": 0.5934884548187256, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.436279296875, "rewards/ppl_reward/std": 3.0443034172058105, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 156.453125, "completions/mean_terminated_length": 156.453125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.8212001218397806, "grad_norm": 1.881885051727295, "kl": 3.79296875, "learning_rate": 9.506400183849439e-06, "loss": 0.1268, "num_tokens": 42274849.0, "reward": -1.7935791015625, "reward_std": 0.8117886781692505, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.274658203125, "rewards/ppl_reward/std": 5.230091571807861, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 131.953125, "completions/mean_terminated_length": 131.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.8224185196466647, "grad_norm": 1.9440921545028687, "kl": 3.744140625, "learning_rate": 9.49789694041075e-06, "loss": 0.0819, "num_tokens": 42289694.0, "reward": -8.14453125, "reward_std": 5.705875396728516, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -20.03125, "rewards/ppl_reward/std": 48.549591064453125, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 152.859375, "completions/mean_terminated_length": 152.859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.8236369174535487, "grad_norm": 1.8725553750991821, "kl": 4.1953125, "learning_rate": 9.489394060920496e-06, "loss": 0.1117, "num_tokens": 42306365.0, "reward": -1.5494384765625, "reward_std": 0.6588796973228455, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.731689453125, "rewards/ppl_reward/std": 2.40315842628479, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 146.671875, "completions/mean_terminated_length": 146.671875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.8248553152604323, "grad_norm": 2.1589996814727783, "kl": 3.62109375, "learning_rate": 9.48089155154197e-06, "loss": 0.0079, "num_tokens": 42322976.0, "reward": -2.010498046875, "reward_std": 1.247023344039917, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.63818359375, "rewards/ppl_reward/std": 6.988663673400879, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.8260737130673164, "grad_norm": 1.9682022333145142, "kl": 4.216796875, "learning_rate": 9.472389418438204e-06, "loss": 0.1223, "num_tokens": 42340548.0, "reward": -1.5029296875, "reward_std": 0.4679976999759674, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.732421875, "rewards/ppl_reward/std": 2.562596082687378, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 144.265625, "completions/mean_terminated_length": 144.265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.8272921108742004, "grad_norm": 1.862789511680603, "kl": 5.2578125, "learning_rate": 9.463887667771946e-06, "loss": 0.188, "num_tokens": 42357445.0, "reward": -0.52593994140625, "reward_std": 1.1935904026031494, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.5675048828125, "rewards/ppl_reward/std": 2.2934200763702393, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2280818521976471, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 172.859375, "completions/mean_terminated_length": 172.859375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.8285105086810844, "grad_norm": 6.573044300079346, "kl": 4.85546875, "learning_rate": 9.455386305705677e-06, "loss": 0.1696, "num_tokens": 42375892.0, "reward": -1.275634765625, "reward_std": 0.9300498962402344, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.13720703125, "rewards/ppl_reward/std": 3.592561721801758, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 155.40625, "completions/mean_terminated_length": 155.40625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.8297289064879685, "grad_norm": 1.6038559675216675, "kl": 4.3671875, "learning_rate": 9.446885338401597e-06, "loss": 0.0584, "num_tokens": 42393334.0, "reward": -2.543701171875, "reward_std": 1.690558671951294, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.52490234375, "rewards/ppl_reward/std": 5.4475507736206055, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2812775671482086, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 163.171875, "completions/mean_terminated_length": 163.171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.8309473042948525, "grad_norm": 4.070890426635742, "kl": 6.83984375, "learning_rate": 9.438384772021609e-06, "loss": 0.1514, "num_tokens": 42410713.0, "reward": -1.7802734375, "reward_std": 1.4305561780929565, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.974609375, "rewards/ppl_reward/std": 5.011240005493164, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2665352523326874, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 169.046875, "completions/mean_terminated_length": 169.046875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.832165702101736, "grad_norm": 1.5043193101882935, "kl": 4.234375, "learning_rate": 9.42988461272734e-06, "loss": 0.1871, "num_tokens": 42428884.0, "reward": -0.10736083984375, "reward_std": 0.5364296436309814, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -3.8709716796875, "rewards/ppl_reward/std": 1.2140820026397705, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.83338409990862, "grad_norm": 1.960160732269287, "kl": 3.484375, "learning_rate": 9.421384866680104e-06, "loss": 0.0523, "num_tokens": 42446580.0, "reward": -0.3785400390625, "reward_std": 0.7843049168586731, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.382080078125, "rewards/ppl_reward/std": 1.7469967603683472, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 171.765625, "completions/mean_terminated_length": 171.765625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.834602497715504, "grad_norm": 1.5229545831680298, "kl": 3.03515625, "learning_rate": 9.412885540040931e-06, "loss": 0.1067, "num_tokens": 42464829.0, "reward": -1.8404541015625, "reward_std": 0.8282253742218018, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.368408203125, "rewards/ppl_reward/std": 3.1100733280181885, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.835820895522388, "grad_norm": 1.5188065767288208, "kl": 4.091796875, "learning_rate": 9.404386638970542e-06, "loss": 0.1308, "num_tokens": 42482121.0, "reward": -1.619140625, "reward_std": 0.9837497472763062, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.93359375, "rewards/ppl_reward/std": 4.118565559387207, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 152.203125, "completions/mean_terminated_length": 152.203125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.837039293329272, "grad_norm": 1.939868450164795, "kl": 3.765625, "learning_rate": 9.395888169629345e-06, "loss": 0.0196, "num_tokens": 42498166.0, "reward": -4.638427734375, "reward_std": 1.1116302013397217, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -12.85498046875, "rewards/ppl_reward/std": 19.781452178955078, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1110801100730896, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 160.59375, "completions/mean_terminated_length": 160.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.838257691136156, "grad_norm": 1.6308059692382812, "kl": 4.328125, "learning_rate": 9.387390138177447e-06, "loss": 0.1588, "num_tokens": 42515612.0, "reward": -1.57080078125, "reward_std": 1.1543147563934326, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.7900390625, "rewards/ppl_reward/std": 5.071411609649658, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 155.359375, "completions/mean_terminated_length": 155.359375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.83947608894304, "grad_norm": 2.5033490657806396, "kl": 3.515625, "learning_rate": 9.378892550774623e-06, "loss": 0.0828, "num_tokens": 42533227.0, "reward": -1.57489013671875, "reward_std": 0.5975927114486694, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.7591552734375, "rewards/ppl_reward/std": 3.3294856548309326, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 156.765625, "completions/mean_terminated_length": 156.765625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.840694486749924, "grad_norm": 2.5996172428131104, "kl": 3.7890625, "learning_rate": 9.370395413580336e-06, "loss": 0.0106, "num_tokens": 42550356.0, "reward": -1.8369140625, "reward_std": 1.567999005317688, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.158203125, "rewards/ppl_reward/std": 4.57133150100708, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.841912884556808, "grad_norm": 1.298749566078186, "kl": 2.548828125, "learning_rate": 9.361898732753715e-06, "loss": 0.0494, "num_tokens": 42566162.0, "reward": -0.9222412109375, "reward_std": 0.475996732711792, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.571044921875, "rewards/ppl_reward/std": 2.9120259284973145, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 156.765625, "completions/mean_terminated_length": 156.765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.8431312823636916, "grad_norm": 2.9616153240203857, "kl": 2.841796875, "learning_rate": 9.353402514453573e-06, "loss": 0.0969, "num_tokens": 42583227.0, "reward": -0.94775390625, "reward_std": 0.6890791654586792, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.6533203125, "rewards/ppl_reward/std": 2.733045816421509, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 139.671875, "completions/mean_terminated_length": 139.671875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.8443496801705757, "grad_norm": 2.0905134677886963, "kl": 3.013671875, "learning_rate": 9.344906764838363e-06, "loss": -0.0026, "num_tokens": 42599054.0, "reward": -1.7899169921875, "reward_std": 0.8890513181686401, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.197021484375, "rewards/ppl_reward/std": 4.66384744644165, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 2.8455680779774597, "grad_norm": 2.631772994995117, "kl": 3.2421875, "learning_rate": 9.336411490066225e-06, "loss": 0.1327, "num_tokens": 42616486.0, "reward": -1.0892333984375, "reward_std": 0.6570883393287659, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.842529296875, "rewards/ppl_reward/std": 2.8579771518707275, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 157.28125, "completions/mean_terminated_length": 157.28125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.8467864757843437, "grad_norm": 2.208686590194702, "kl": 4.09765625, "learning_rate": 9.327916696294935e-06, "loss": 0.1358, "num_tokens": 42634200.0, "reward": -0.1319580078125, "reward_std": 0.5584392547607422, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.006103515625, "rewards/ppl_reward/std": 0.9117224216461182, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 142.390625, "completions/mean_terminated_length": 142.390625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.8480048735912273, "grad_norm": 3.6918015480041504, "kl": 5.2421875, "learning_rate": 9.319422389681928e-06, "loss": 0.1249, "num_tokens": 42650689.0, "reward": -1.3492431640625, "reward_std": 1.2793476581573486, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.339111328125, "rewards/ppl_reward/std": 3.5777721405029297, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 153.109375, "completions/mean_terminated_length": 153.109375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.8492232713981114, "grad_norm": 1.5119608640670776, "kl": 6.37109375, "learning_rate": 9.310928576384293e-06, "loss": 0.3499, "num_tokens": 42667312.0, "reward": -0.3909912109375, "reward_std": 0.6863993406295776, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.360107421875, "rewards/ppl_reward/std": 1.5944342613220215, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 141.703125, "completions/mean_terminated_length": 141.703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.8504416692049954, "grad_norm": 3.597106456756592, "kl": 5.873046875, "learning_rate": 9.302435262558748e-06, "loss": 0.1469, "num_tokens": 42683253.0, "reward": -2.873779296875, "reward_std": 0.9658684730529785, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.25537109375, "rewards/ppl_reward/std": 3.715193033218384, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 135.546875, "completions/mean_terminated_length": 135.546875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 2.8516600670118795, "grad_norm": 1.9719295501708984, "kl": 4.0390625, "learning_rate": 9.293942454361655e-06, "loss": 0.0545, "num_tokens": 42698656.0, "reward": -1.564453125, "reward_std": 0.6431083679199219, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.83203125, "rewards/ppl_reward/std": 5.255087375640869, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.8528784648187635, "grad_norm": 1.5599747896194458, "kl": 2.86328125, "learning_rate": 9.285450157949015e-06, "loss": 0.0708, "num_tokens": 42713664.0, "reward": -2.158935546875, "reward_std": 0.920394778251648, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.95849609375, "rewards/ppl_reward/std": 6.808715343475342, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 131.3125, "completions/mean_terminated_length": 131.3125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.8540968626256475, "grad_norm": 2.671700954437256, "kl": 6.6484375, "learning_rate": 9.276958379476449e-06, "loss": 0.2216, "num_tokens": 42729084.0, "reward": -3.47747802734375, "reward_std": 1.0418095588684082, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -10.5487060546875, "rewards/ppl_reward/std": 10.135643005371094, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 125.890625, "completions/mean_terminated_length": 125.890625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.855315260432531, "grad_norm": 1.413689374923706, "kl": 3.791015625, "learning_rate": 9.268467125099201e-06, "loss": 0.0494, "num_tokens": 42743965.0, "reward": -4.4267578125, "reward_std": 1.3341279029846191, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -12.439453125, "rewards/ppl_reward/std": 14.279657363891602, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.856533658239415, "grad_norm": 2.1846563816070557, "kl": 7.3984375, "learning_rate": 9.259976400972147e-06, "loss": 0.411, "num_tokens": 42762445.0, "reward": -3.6168212890625, "reward_std": 1.9012162685394287, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.788330078125, "rewards/ppl_reward/std": 11.125494003295898, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 135.15625, "completions/mean_terminated_length": 135.15625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.8577520560462992, "grad_norm": 1.7443828582763672, "kl": 4.693359375, "learning_rate": 9.251486213249773e-06, "loss": 0.1225, "num_tokens": 42777975.0, "reward": -0.93218994140625, "reward_std": 0.5478851795196533, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.5596923828125, "rewards/ppl_reward/std": 4.269169807434082, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 139.734375, "completions/mean_terminated_length": 139.734375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.858970453853183, "grad_norm": 1.9908090829849243, "kl": 3.55078125, "learning_rate": 9.24299656808617e-06, "loss": 0.0984, "num_tokens": 42794014.0, "reward": -1.3917236328125, "reward_std": 0.6863534450531006, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.439697265625, "rewards/ppl_reward/std": 3.283477783203125, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 138.234375, "completions/mean_terminated_length": 138.234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 2.860188851660067, "grad_norm": 2.8032283782958984, "kl": 8.140625, "learning_rate": 9.234507471635043e-06, "loss": 0.272, "num_tokens": 42810565.0, "reward": -4.5426025390625, "reward_std": 1.286310076713562, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -12.405517578125, "rewards/ppl_reward/std": 11.549595832824707, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2551248073577881, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 148.640625, "completions/mean_terminated_length": 148.640625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.861407249466951, "grad_norm": 2.2264652252197266, "kl": 5.5390625, "learning_rate": 9.2260189300497e-06, "loss": 0.2402, "num_tokens": 42827686.0, "reward": -1.039794921875, "reward_std": 0.5012452602386475, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.79052734375, "rewards/ppl_reward/std": 3.3517959117889404, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 136.78125, "completions/mean_terminated_length": 136.78125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.862625647273835, "grad_norm": 2.4094107151031494, "kl": 6.2421875, "learning_rate": 9.21753094948304e-06, "loss": 0.2144, "num_tokens": 42842848.0, "reward": -1.6214599609375, "reward_std": 1.0988566875457764, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.860107421875, "rewards/ppl_reward/std": 4.94542121887207, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21007457375526428, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 126.28125, "completions/mean_terminated_length": 126.28125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.863844045080719, "grad_norm": 1.6016916036605835, "kl": 4.15234375, "learning_rate": 9.209043536087566e-06, "loss": 0.1158, "num_tokens": 42857930.0, "reward": -2.3193359375, "reward_std": 1.9434428215026855, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.341796875, "rewards/ppl_reward/std": 9.05047607421875, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 133.40625, "completions/mean_terminated_length": 133.40625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.865062442887603, "grad_norm": 1.4724706411361694, "kl": 3.0703125, "learning_rate": 9.200556696015356e-06, "loss": 0.0462, "num_tokens": 42873068.0, "reward": -2.14630126953125, "reward_std": 1.138857126235962, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.9254150390625, "rewards/ppl_reward/std": 5.7519097328186035, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 126.609375, "completions/mean_terminated_length": 126.609375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.8662808406944866, "grad_norm": 2.036939859390259, "kl": 5.33203125, "learning_rate": 9.192070435418079e-06, "loss": 0.1494, "num_tokens": 42887483.0, "reward": -2.625732421875, "reward_std": 1.2085468769073486, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.79833984375, "rewards/ppl_reward/std": 9.720132827758789, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 129.796875, "completions/mean_terminated_length": 129.796875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.8674992385013707, "grad_norm": 1.7834302186965942, "kl": 3.578125, "learning_rate": 9.183584760446987e-06, "loss": 0.0724, "num_tokens": 42902542.0, "reward": -2.1956787109375, "reward_std": 0.6649420857429504, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.110107421875, "rewards/ppl_reward/std": 4.956909656524658, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 135.984375, "completions/mean_terminated_length": 135.984375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.8687176363082547, "grad_norm": 2.0366673469543457, "kl": 4.09375, "learning_rate": 9.17509967725291e-06, "loss": 0.1828, "num_tokens": 42918133.0, "reward": -25.724365234375, "reward_std": 59.22269058227539, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -55.15185546875, "rewards/ppl_reward/std": 339.0070495605469, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 137.546875, "completions/mean_terminated_length": 137.546875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.8699360341151388, "grad_norm": 1.5834307670593262, "kl": 2.41796875, "learning_rate": 9.166615191986234e-06, "loss": 0.0163, "num_tokens": 42934000.0, "reward": -4.3140869140625, "reward_std": 1.8370407819747925, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -12.425048828125, "rewards/ppl_reward/std": 11.93088436126709, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 132.734375, "completions/mean_terminated_length": 132.734375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.8711544319220224, "grad_norm": 1.3108361959457397, "kl": 2.7607421875, "learning_rate": 9.15813131079693e-06, "loss": 0.0582, "num_tokens": 42949367.0, "reward": -2.0826416015625, "reward_std": 0.38792556524276733, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.977783203125, "rewards/ppl_reward/std": 4.611014366149902, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 150.078125, "completions/mean_terminated_length": 150.078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.8723728297289064, "grad_norm": 2.0569233894348145, "kl": 3.26953125, "learning_rate": 9.149648039834524e-06, "loss": 0.1055, "num_tokens": 42965708.0, "reward": -1.5863037109375, "reward_std": 0.8115039467811584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.821044921875, "rewards/ppl_reward/std": 5.665353775024414, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.8735912275357904, "grad_norm": 1.626331090927124, "kl": 2.83203125, "learning_rate": 9.141165385248092e-06, "loss": 0.1156, "num_tokens": 42982672.0, "reward": -1.247314453125, "reward_std": 0.40802282094955444, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.33056640625, "rewards/ppl_reward/std": 3.8924946784973145, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 144.484375, "completions/mean_terminated_length": 144.484375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.8748096253426745, "grad_norm": 1.691327691078186, "kl": 5.5, "learning_rate": 9.132683353186276e-06, "loss": 0.2802, "num_tokens": 42998439.0, "reward": -1.2117919921875, "reward_std": 0.9546077251434326, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.025146484375, "rewards/ppl_reward/std": 1.875030279159546, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 147.1875, "completions/mean_terminated_length": 147.1875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.8760280231495585, "grad_norm": 2.3009731769561768, "kl": 5.9921875, "learning_rate": 9.124201949797254e-06, "loss": 0.23, "num_tokens": 43015387.0, "reward": -0.64361572265625, "reward_std": 0.7534195184707642, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.8341064453125, "rewards/ppl_reward/std": 2.0995559692382812, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 132.71875, "completions/mean_terminated_length": 132.71875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.877246420956442, "grad_norm": 1.8104358911514282, "kl": 4.60546875, "learning_rate": 9.11572118122876e-06, "loss": 0.0843, "num_tokens": 43030529.0, "reward": -0.6929931640625, "reward_std": 1.184357762336731, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.940673828125, "rewards/ppl_reward/std": 2.8176088333129883, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 148.515625, "completions/mean_terminated_length": 148.515625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.878464818763326, "grad_norm": 2.0564839839935303, "kl": 5.8125, "learning_rate": 9.107241053628058e-06, "loss": 0.3596, "num_tokens": 43047786.0, "reward": -1.7799072265625, "reward_std": 1.1366952657699585, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.372314453125, "rewards/ppl_reward/std": 6.43749475479126, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 140.453125, "completions/mean_terminated_length": 140.453125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.87968321657021, "grad_norm": 2.236104965209961, "kl": 5.1416015625, "learning_rate": 9.098761573141957e-06, "loss": 0.1911, "num_tokens": 43063431.0, "reward": -1.628662109375, "reward_std": 0.615028440952301, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.89794921875, "rewards/ppl_reward/std": 3.482375383377075, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 137.84375, "completions/mean_terminated_length": 137.84375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.8809016143770942, "grad_norm": 1.6969571113586426, "kl": 5.73046875, "learning_rate": 9.09028274591679e-06, "loss": 0.1898, "num_tokens": 43078989.0, "reward": -1.394287109375, "reward_std": 1.4763133525848389, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.35888671875, "rewards/ppl_reward/std": 3.678176164627075, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2649018466472626, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.882120012183978, "grad_norm": 1.8500808477401733, "kl": 6.23828125, "learning_rate": 9.08180457809842e-06, "loss": 0.1918, "num_tokens": 43095409.0, "reward": -0.8370361328125, "reward_std": 1.4075852632522583, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.158447265625, "rewards/ppl_reward/std": 2.609097957611084, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.27716949582099915, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 153.8125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 2.883338409990862, "grad_norm": 2.801915407180786, "kl": 5.728515625, "learning_rate": 9.07332707583223e-06, "loss": 0.2329, "num_tokens": 43112605.0, "reward": -0.4124755859375, "reward_std": 0.4530155658721924, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.520263671875, "rewards/ppl_reward/std": 2.075479507446289, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 135.734375, "completions/mean_terminated_length": 135.734375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.884556807797746, "grad_norm": 2.752185583114624, "kl": 6.73828125, "learning_rate": 9.064850245263118e-06, "loss": 0.2521, "num_tokens": 43127956.0, "reward": -1.155029296875, "reward_std": 0.8013705611228943, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.82568359375, "rewards/ppl_reward/std": 2.884382724761963, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 140.640625, "completions/mean_terminated_length": 140.640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.88577520560463, "grad_norm": 1.7430649995803833, "kl": 5.36328125, "learning_rate": 9.056374092535504e-06, "loss": 0.1245, "num_tokens": 43143605.0, "reward": -2.4022216796875, "reward_std": 2.3817718029022217, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.398193359375, "rewards/ppl_reward/std": 9.807530403137207, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1666666716337204, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 140.265625, "completions/mean_terminated_length": 140.265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 2.886993603411514, "grad_norm": 3.5387589931488037, "kl": 5.8046875, "learning_rate": 9.047898623793306e-06, "loss": 0.1774, "num_tokens": 43159918.0, "reward": -3.4544677734375, "reward_std": 1.1499488353729248, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.455810546875, "rewards/ppl_reward/std": 12.205350875854492, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 137.71875, "completions/mean_terminated_length": 137.71875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 2.888212001218398, "grad_norm": 1.3522961139678955, "kl": 3.1171875, "learning_rate": 9.039423845179954e-06, "loss": 0.0664, "num_tokens": 43175708.0, "reward": -0.0050048828125, "reward_std": 0.39651137590408325, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.728759765625, "rewards/ppl_reward/std": 0.9844374656677246, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 130.34375, "completions/mean_terminated_length": 130.34375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.8894303990252816, "grad_norm": 2.1371021270751953, "kl": 3.61328125, "learning_rate": 9.030949762838371e-06, "loss": 0.0588, "num_tokens": 43190842.0, "reward": -0.8502197265625, "reward_std": 0.7067540884017944, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.356689453125, "rewards/ppl_reward/std": 2.105600118637085, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.8906487968321657, "grad_norm": 1.5010937452316284, "kl": 3.34765625, "learning_rate": 9.022476382910983e-06, "loss": 0.0639, "num_tokens": 43208406.0, "reward": -0.7161865234375, "reward_std": 0.8416204452514648, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.197998046875, "rewards/ppl_reward/std": 3.09834885597229, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 146.546875, "completions/mean_terminated_length": 146.546875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.8918671946390497, "grad_norm": 2.1244518756866455, "kl": 3.0009765625, "learning_rate": 9.014003711539704e-06, "loss": 0.129, "num_tokens": 43224793.0, "reward": -2.09521484375, "reward_std": 0.7297884225845337, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.0029296875, "rewards/ppl_reward/std": 3.2689523696899414, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 136.859375, "completions/mean_terminated_length": 136.859375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 2.8930855924459333, "grad_norm": 2.16943097114563, "kl": 4.90234375, "learning_rate": 9.005531754865929e-06, "loss": 0.1597, "num_tokens": 43241088.0, "reward": -1.9796142578125, "reward_std": 0.916705846786499, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.584228515625, "rewards/ppl_reward/std": 7.60305643081665, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.8943039902528174, "grad_norm": 1.5247771739959717, "kl": 2.6669921875, "learning_rate": 8.997060519030543e-06, "loss": 0.0348, "num_tokens": 43256804.0, "reward": -0.2138671875, "reward_std": 0.2895717918872833, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.224609375, "rewards/ppl_reward/std": 1.246250867843628, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 139.046875, "completions/mean_terminated_length": 139.046875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.8955223880597014, "grad_norm": 2.2717947959899902, "kl": 7.05078125, "learning_rate": 8.988590010173906e-06, "loss": 0.2575, "num_tokens": 43272431.0, "reward": -0.2352294921875, "reward_std": 0.7829129695892334, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -3.907958984375, "rewards/ppl_reward/std": 1.7321161031723022, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 137.78125, "completions/mean_terminated_length": 137.78125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.8967407858665855, "grad_norm": 2.0650105476379395, "kl": 5.0, "learning_rate": 8.98012023443585e-06, "loss": 0.1681, "num_tokens": 43288305.0, "reward": -1.648681640625, "reward_std": 1.0436880588531494, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.89111328125, "rewards/ppl_reward/std": 3.2660813331604004, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.125, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.8979591836734695, "grad_norm": 2.1339375972747803, "kl": 3.625, "learning_rate": 8.971651197955669e-06, "loss": 0.0779, "num_tokens": 43303269.0, "reward": -1.02728271484375, "reward_std": 0.5122666358947754, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.7811279296875, "rewards/ppl_reward/std": 4.552656650543213, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.8991775814803535, "grad_norm": 3.0309720039367676, "kl": 5.66015625, "learning_rate": 8.963182906872134e-06, "loss": 0.1511, "num_tokens": 43319925.0, "reward": -1.474609375, "reward_std": 1.051149845123291, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.53515625, "rewards/ppl_reward/std": 2.8901753425598145, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 146.5625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.900395979287237, "grad_norm": 1.292016625404358, "kl": 3.0546875, "learning_rate": 8.954715367323468e-06, "loss": 0.046, "num_tokens": 43336353.0, "reward": -0.082763671875, "reward_std": 0.4571814239025116, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.90771484375, "rewards/ppl_reward/std": 1.2961971759796143, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 148.46875, "completions/mean_terminated_length": 148.46875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.901614377094121, "grad_norm": 3.5158960819244385, "kl": 4.703125, "learning_rate": 8.946248585447347e-06, "loss": 0.1078, "num_tokens": 43353575.0, "reward": -1.152587890625, "reward_std": 0.5904366970062256, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.03173828125, "rewards/ppl_reward/std": 6.572294235229492, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.902832774901005, "grad_norm": 1.3638125658035278, "kl": 2.7177734375, "learning_rate": 8.937782567380908e-06, "loss": 0.0665, "num_tokens": 43371807.0, "reward": -3.3836669921875, "reward_std": 0.6158331632614136, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.517333984375, "rewards/ppl_reward/std": 9.984901428222656, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 144.921875, "completions/mean_terminated_length": 144.921875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.9040511727078893, "grad_norm": 1.4550539255142212, "kl": 4.0986328125, "learning_rate": 8.929317319260727e-06, "loss": 0.1678, "num_tokens": 43388434.0, "reward": -1.0118408203125, "reward_std": 0.7280527353286743, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.718994140625, "rewards/ppl_reward/std": 2.7965357303619385, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.905269570514773, "grad_norm": 1.6646034717559814, "kl": 3.0390625, "learning_rate": 8.920852847222816e-06, "loss": 0.1018, "num_tokens": 43403010.0, "reward": -2.45758056640625, "reward_std": 1.3457825183868408, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.7432861328125, "rewards/ppl_reward/std": 7.66194486618042, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 153.609375, "completions/mean_terminated_length": 153.609375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.906487968321657, "grad_norm": 1.971549391746521, "kl": 3.095703125, "learning_rate": 8.91238915740264e-06, "loss": 0.1428, "num_tokens": 43420057.0, "reward": -0.0853271484375, "reward_std": 0.4525564908981323, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.944091796875, "rewards/ppl_reward/std": 0.9734939336776733, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 139.203125, "completions/mean_terminated_length": 139.203125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.907706366128541, "grad_norm": 8.770218849182129, "kl": 4.38671875, "learning_rate": 8.903926255935085e-06, "loss": 0.0971, "num_tokens": 43435790.0, "reward": -0.58526611328125, "reward_std": 0.652854323387146, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.8033447265625, "rewards/ppl_reward/std": 2.0474700927734375, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1302827149629593, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 145.84375, "completions/mean_terminated_length": 145.84375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.908924763935425, "grad_norm": 3.512434482574463, "kl": 6.6875, "learning_rate": 8.895464148954465e-06, "loss": 0.1841, "num_tokens": 43452332.0, "reward": -1.4381103515625, "reward_std": 0.6980472803115845, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.298095703125, "rewards/ppl_reward/std": 2.8957293033599854, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 137.953125, "completions/mean_terminated_length": 137.953125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.910143161742309, "grad_norm": 1.6656136512756348, "kl": 4.7421875, "learning_rate": 8.887002842594524e-06, "loss": 0.1014, "num_tokens": 43467881.0, "reward": -1.12158203125, "reward_std": 0.5550721883773804, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.9072265625, "rewards/ppl_reward/std": 3.3847203254699707, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 148.484375, "completions/mean_terminated_length": 148.484375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.911361559549193, "grad_norm": 1.6335994005203247, "kl": 4.61328125, "learning_rate": 8.878542342988428e-06, "loss": 0.2632, "num_tokens": 43484440.0, "reward": -1.81396484375, "reward_std": 1.17043137550354, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.3935546875, "rewards/ppl_reward/std": 5.641400337219238, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 126.703125, "completions/mean_terminated_length": 126.703125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.9125799573560767, "grad_norm": 2.4425196647644043, "kl": 4.140625, "learning_rate": 8.870082656268746e-06, "loss": 0.104, "num_tokens": 43499093.0, "reward": -1.9925537109375, "reward_std": 1.5216076374053955, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.586669921875, "rewards/ppl_reward/std": 6.821601390838623, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 134.359375, "completions/mean_terminated_length": 134.359375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.9137983551629607, "grad_norm": 1.6508427858352661, "kl": 4.7109375, "learning_rate": 8.86162378856747e-06, "loss": 0.2066, "num_tokens": 43514500.0, "reward": -2.5345458984375, "reward_std": 2.2067089080810547, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.787841796875, "rewards/ppl_reward/std": 9.809233665466309, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.9150167529698447, "grad_norm": 1.9547277688980103, "kl": 3.1953125, "learning_rate": 8.853165746015997e-06, "loss": 0.0643, "num_tokens": 43528628.0, "reward": -2.8846435546875, "reward_std": 0.9871774315834045, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.363037109375, "rewards/ppl_reward/std": 7.15439510345459, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.9162351507767283, "grad_norm": 1.8499040603637695, "kl": 4.078125, "learning_rate": 8.844708534745117e-06, "loss": 0.0984, "num_tokens": 43543828.0, "reward": -1.5380859375, "reward_std": 0.9621394276618958, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.693359375, "rewards/ppl_reward/std": 3.8610012531280518, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 133.59375, "completions/mean_terminated_length": 133.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.9174535485836124, "grad_norm": 1.7404764890670776, "kl": 3.2265625, "learning_rate": 8.836252160885029e-06, "loss": 0.0517, "num_tokens": 43559266.0, "reward": -1.6378173828125, "reward_std": 1.064690351486206, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.970947265625, "rewards/ppl_reward/std": 4.786554336547852, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 140.59375, "completions/mean_terminated_length": 140.59375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.9186719463904964, "grad_norm": 1.959873080253601, "kl": 3.416015625, "learning_rate": 8.827796630565313e-06, "loss": 0.0747, "num_tokens": 43575464.0, "reward": -1.2569580078125, "reward_std": 0.9338737726211548, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.271728515625, "rewards/ppl_reward/std": 4.859750270843506, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 153.984375, "completions/mean_terminated_length": 153.984375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.9198903441973805, "grad_norm": 2.4373974800109863, "kl": 5.462890625, "learning_rate": 8.819341949914947e-06, "loss": 0.1957, "num_tokens": 43593295.0, "reward": -0.7357177734375, "reward_std": 0.7217285633087158, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.072998046875, "rewards/ppl_reward/std": 1.588545560836792, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23854589462280273, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 136.359375, "completions/mean_terminated_length": 136.359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.9211087420042645, "grad_norm": 1.5911632776260376, "kl": 4.80078125, "learning_rate": 8.81088812506229e-06, "loss": 0.1641, "num_tokens": 43608662.0, "reward": -1.852783203125, "reward_std": 1.5352859497070312, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.29931640625, "rewards/ppl_reward/std": 3.6830849647521973, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 143.4375, "completions/mean_terminated_length": 143.4375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.9223271398111486, "grad_norm": 1.7728209495544434, "kl": 5.05859375, "learning_rate": 8.80243516213508e-06, "loss": 0.1546, "num_tokens": 43625114.0, "reward": -1.2337646484375, "reward_std": 0.6841549873352051, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.053466796875, "rewards/ppl_reward/std": 1.995176911354065, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 132.71875, "completions/mean_terminated_length": 132.71875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.923545537618032, "grad_norm": 1.9309537410736084, "kl": 5.07421875, "learning_rate": 8.793983067260434e-06, "loss": 0.157, "num_tokens": 43640184.0, "reward": -3.1962890625, "reward_std": 1.8397843837738037, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.056640625, "rewards/ppl_reward/std": 9.359458923339844, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1302827149629593, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 123.3125, "completions/mean_terminated_length": 123.3125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.924763935424916, "grad_norm": 1.6777905225753784, "kl": 3.08203125, "learning_rate": 8.785531846564832e-06, "loss": 0.0641, "num_tokens": 43654436.0, "reward": -1.1907958984375, "reward_std": 0.48914650082588196, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.155029296875, "rewards/ppl_reward/std": 1.6850947141647339, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 137.421875, "completions/mean_terminated_length": 137.421875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.9259823332318002, "grad_norm": 2.7079527378082275, "kl": 5.837890625, "learning_rate": 8.777081506174127e-06, "loss": 0.1482, "num_tokens": 43671071.0, "reward": -3.15625, "reward_std": 3.086881637573242, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -9.703125, "rewards/ppl_reward/std": 13.26193618774414, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.26246222853660583, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 145.78125, "completions/mean_terminated_length": 145.78125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.9272007310386843, "grad_norm": 1.5598353147506714, "kl": 4.4765625, "learning_rate": 8.768632052213532e-06, "loss": 0.0802, "num_tokens": 43688249.0, "reward": -1.6578369140625, "reward_std": 0.8397162556648254, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.846923828125, "rewards/ppl_reward/std": 2.9582433700561523, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 131.09375, "completions/mean_terminated_length": 131.09375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.928419128845568, "grad_norm": 2.2789459228515625, "kl": 4.98828125, "learning_rate": 8.76018349080762e-06, "loss": 0.1189, "num_tokens": 43703335.0, "reward": -2.975341796875, "reward_std": 1.393752932548523, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.66943359375, "rewards/ppl_reward/std": 7.934266090393066, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 121.609375, "completions/mean_terminated_length": 121.609375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.929637526652452, "grad_norm": 1.9031609296798706, "kl": 3.1611328125, "learning_rate": 8.751735828080308e-06, "loss": 0.0067, "num_tokens": 43717926.0, "reward": -0.72705078125, "reward_std": 0.6059532165527344, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.1806640625, "rewards/ppl_reward/std": 2.7139406204223633, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 134.828125, "completions/mean_terminated_length": 134.828125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.930855924459336, "grad_norm": 1.8352279663085938, "kl": 5.38671875, "learning_rate": 8.743289070154875e-06, "loss": 0.1482, "num_tokens": 43733795.0, "reward": -0.578369140625, "reward_std": 0.648444414138794, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.79736328125, "rewards/ppl_reward/std": 3.1343255043029785, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 137.9375, "completions/mean_terminated_length": 137.9375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.93207432226622, "grad_norm": 3.1073062419891357, "kl": 3.638671875, "learning_rate": 8.734843223153931e-06, "loss": 0.1233, "num_tokens": 43749511.0, "reward": -1.2469482421875, "reward_std": 0.5988537073135376, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.126708984375, "rewards/ppl_reward/std": 2.4460625648498535, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.933292720073104, "grad_norm": 2.6840834617614746, "kl": 4.75390625, "learning_rate": 8.726398293199434e-06, "loss": 0.2027, "num_tokens": 43764959.0, "reward": -3.2362060546875, "reward_std": 1.810822606086731, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -10.011474609375, "rewards/ppl_reward/std": 13.03958511352539, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 128.46875, "completions/mean_terminated_length": 128.46875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.934511117879988, "grad_norm": 1.4692432880401611, "kl": 2.8564453125, "learning_rate": 8.71795428641268e-06, "loss": 0.0726, "num_tokens": 43779861.0, "reward": -1.698486328125, "reward_std": 0.6898599863052368, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.12353515625, "rewards/ppl_reward/std": 4.7711920738220215, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.9357295156868717, "grad_norm": 1.3884660005569458, "kl": 1.982421875, "learning_rate": 8.709511208914282e-06, "loss": 0.0513, "num_tokens": 43795317.0, "reward": -0.529052734375, "reward_std": 0.5298863649368286, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.90185546875, "rewards/ppl_reward/std": 1.8863751888275146, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.9369479134937557, "grad_norm": 2.354578733444214, "kl": 5.0625, "learning_rate": 8.701069066824195e-06, "loss": 0.206, "num_tokens": 43812435.0, "reward": -2.625732421875, "reward_std": 1.8076863288879395, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.89208984375, "rewards/ppl_reward/std": 10.328110694885254, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 149.03125, "completions/mean_terminated_length": 149.03125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.9381663113006398, "grad_norm": 1.5714950561523438, "kl": 3.837890625, "learning_rate": 8.69262786626169e-06, "loss": 0.1493, "num_tokens": 43830277.0, "reward": -0.8623046875, "reward_std": 0.4933188557624817, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.490234375, "rewards/ppl_reward/std": 2.76067852973938, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 128.65625, "completions/mean_terminated_length": 128.65625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.9393847091075234, "grad_norm": 1.544945478439331, "kl": 4.375, "learning_rate": 8.684187613345356e-06, "loss": 0.0868, "num_tokens": 43845599.0, "reward": -0.13427734375, "reward_std": 0.6066715717315674, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -3.7529296875, "rewards/ppl_reward/std": 1.211798071861267, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 125.171875, "completions/mean_terminated_length": 125.171875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.9406031069144074, "grad_norm": 1.7812037467956543, "kl": 3.76171875, "learning_rate": 8.675748314193086e-06, "loss": 0.1011, "num_tokens": 43860018.0, "reward": -0.6551513671875, "reward_std": 0.5178900957107544, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.075927734375, "rewards/ppl_reward/std": 2.455533504486084, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 132.4444580078125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.9418215047212914, "grad_norm": 1.3557703495025635, "kl": 2.1083984375, "learning_rate": 8.667309974922094e-06, "loss": 0.2014, "num_tokens": 43875938.0, "reward": -2.940185546875, "reward_std": 1.7238703966140747, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.62255859375, "rewards/ppl_reward/std": 15.086453437805176, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 142.515625, "completions/mean_terminated_length": 142.515625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.9430399025281755, "grad_norm": 1.5750776529312134, "kl": 2.8359375, "learning_rate": 8.658872601648896e-06, "loss": 0.0642, "num_tokens": 43892043.0, "reward": -1.02197265625, "reward_std": 0.5887621641159058, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.6923828125, "rewards/ppl_reward/std": 2.6376490592956543, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 122.84375, "completions/mean_terminated_length": 122.84375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.9442583003350595, "grad_norm": 2.3634562492370605, "kl": 2.5048828125, "learning_rate": 8.650436200489303e-06, "loss": 0.0909, "num_tokens": 43906017.0, "reward": -1.7392578125, "reward_std": 1.7294750213623047, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.251953125, "rewards/ppl_reward/std": 10.227588653564453, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 146.953125, "completions/mean_terminated_length": 146.953125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.9454766981419436, "grad_norm": 2.0822269916534424, "kl": 2.1650390625, "learning_rate": 8.642000777558429e-06, "loss": 0.069, "num_tokens": 43922782.0, "reward": -0.7591552734375, "reward_std": 0.32948124408721924, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.369873046875, "rewards/ppl_reward/std": 2.2412526607513428, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 136.71875, "completions/mean_terminated_length": 136.71875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.946695095948827, "grad_norm": 1.7405658960342407, "kl": 2.880859375, "learning_rate": 8.633566338970666e-06, "loss": 0.1, "num_tokens": 43938500.0, "reward": -4.049560546875, "reward_std": 0.8052377700805664, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -11.88818359375, "rewards/ppl_reward/std": 11.786751747131348, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.947913493755711, "grad_norm": 2.142509937286377, "kl": 4.2578125, "learning_rate": 8.625132890839706e-06, "loss": 0.1507, "num_tokens": 43954087.0, "reward": -0.83917236328125, "reward_std": 0.837965726852417, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.3658447265625, "rewards/ppl_reward/std": 3.883533477783203, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 132.234375, "completions/mean_terminated_length": 132.234375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.9491318915625953, "grad_norm": 1.7731523513793945, "kl": 4.6328125, "learning_rate": 8.616700439278518e-06, "loss": 0.2073, "num_tokens": 43970078.0, "reward": -0.9554443359375, "reward_std": 0.5789105892181396, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.614013671875, "rewards/ppl_reward/std": 2.3083853721618652, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 125.96875, "completions/mean_terminated_length": 125.96875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.9503502893694793, "grad_norm": 5.173728942871094, "kl": 4.9921875, "learning_rate": 8.60826899039935e-06, "loss": 0.1756, "num_tokens": 43984804.0, "reward": -3.0025634765625, "reward_std": 1.241004228591919, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.591064453125, "rewards/ppl_reward/std": 7.808615684509277, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.951568687176363, "grad_norm": 1.815753698348999, "kl": 3.4794921875, "learning_rate": 8.599838550313714e-06, "loss": 0.071, "num_tokens": 43998780.0, "reward": -2.3062744140625, "reward_std": 1.3696699142456055, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.245361328125, "rewards/ppl_reward/std": 5.123347759246826, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.952787084983247, "grad_norm": 2.1055691242218018, "kl": 3.701171875, "learning_rate": 8.5914091251324e-06, "loss": 0.0172, "num_tokens": 44014028.0, "reward": -4.6121826171875, "reward_std": 1.5845677852630615, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -12.950927734375, "rewards/ppl_reward/std": 20.084827423095703, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 139.65625, "completions/mean_terminated_length": 139.65625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.954005482790131, "grad_norm": 1.8602575063705444, "kl": 5.7109375, "learning_rate": 8.582980720965465e-06, "loss": 0.1746, "num_tokens": 44029622.0, "reward": -2.604736328125, "reward_std": 1.6469783782958984, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.78759765625, "rewards/ppl_reward/std": 6.109748840332031, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.955223880597015, "grad_norm": 1.6541301012039185, "kl": 4.7412109375, "learning_rate": 8.574553343922221e-06, "loss": 0.1911, "num_tokens": 44045910.0, "reward": -0.472900390625, "reward_std": 0.4243168532848358, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.64111328125, "rewards/ppl_reward/std": 3.6943583488464355, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1302827149629593, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 132.90625, "completions/mean_terminated_length": 132.90625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.956442278403899, "grad_norm": 3.3053689002990723, "kl": 4.736328125, "learning_rate": 8.56612700011123e-06, "loss": 0.0714, "num_tokens": 44061088.0, "reward": -2.220703125, "reward_std": 1.1920031309127808, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.16796875, "rewards/ppl_reward/std": 6.577654838562012, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 125.84375, "completions/mean_terminated_length": 125.84375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.9576606762107827, "grad_norm": 1.8237628936767578, "kl": 2.18359375, "learning_rate": 8.557701695640321e-06, "loss": 0.0441, "num_tokens": 44075822.0, "reward": -1.476318359375, "reward_std": 1.183983325958252, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.83544921875, "rewards/ppl_reward/std": 4.3974199295043945, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 141.84375, "completions/mean_terminated_length": 141.84375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.9588790740176667, "grad_norm": 1.824920892715454, "kl": 2.90625, "learning_rate": 8.549277436616551e-06, "loss": 0.1153, "num_tokens": 44091524.0, "reward": -0.790283203125, "reward_std": 0.38362184166908264, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.40869140625, "rewards/ppl_reward/std": 2.6789026260375977, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 130.421875, "completions/mean_terminated_length": 130.421875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.9600974718245507, "grad_norm": 1.7971543073654175, "kl": 2.1708984375, "learning_rate": 8.54085422914623e-06, "loss": 0.0953, "num_tokens": 44105943.0, "reward": -2.4822998046875, "reward_std": 0.5284498929977417, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.753662109375, "rewards/ppl_reward/std": 7.110902786254883, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 153.3125, "completions/mean_terminated_length": 153.3125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.961315869631435, "grad_norm": 2.057184934616089, "kl": 4.0400390625, "learning_rate": 8.53243207933491e-06, "loss": 0.1358, "num_tokens": 44122987.0, "reward": -0.3275146484375, "reward_std": 0.5851132869720459, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.178466796875, "rewards/ppl_reward/std": 1.2127025127410889, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1620931327342987, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 146.03125, "completions/mean_terminated_length": 146.03125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.9625342674383184, "grad_norm": 1.6460984945297241, "kl": 5.6171875, "learning_rate": 8.524010993287364e-06, "loss": 0.2747, "num_tokens": 44139653.0, "reward": -1.7620849609375, "reward_std": 0.6128594875335693, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.211669921875, "rewards/ppl_reward/std": 4.642149448394775, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 156.609375, "completions/mean_terminated_length": 142.84127807617188, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.9637526652452024, "grad_norm": 3.4253411293029785, "kl": 9.4453125, "learning_rate": 8.515590977107597e-06, "loss": 0.5263, "num_tokens": 44156708.0, "reward": -1.1229248046875, "reward_std": 1.0249686241149902, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.886474609375, "rewards/ppl_reward/std": 5.470332622528076, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 138.09375, "completions/mean_terminated_length": 138.09375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.9649710630520865, "grad_norm": 1.34687340259552, "kl": 3.83984375, "learning_rate": 8.507172036898845e-06, "loss": 0.1544, "num_tokens": 44172042.0, "reward": -3.908935546875, "reward_std": 0.8207844495773315, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -11.45849609375, "rewards/ppl_reward/std": 10.795086860656738, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 135.546875, "completions/mean_terminated_length": 135.546875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.9661894608589705, "grad_norm": 2.061617136001587, "kl": 5.6015625, "learning_rate": 8.498754178763564e-06, "loss": 0.1866, "num_tokens": 44187453.0, "reward": -1.9329833984375, "reward_std": 2.008971691131592, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.475341796875, "rewards/ppl_reward/std": 7.141733646392822, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.23345555365085602, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 148.84375, "completions/mean_terminated_length": 148.84375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.9674078586658545, "grad_norm": 2.493626594543457, "kl": 6.15625, "learning_rate": 8.490337408803415e-06, "loss": 0.2128, "num_tokens": 44204499.0, "reward": -1.066650390625, "reward_std": 0.7374671697616577, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.70361328125, "rewards/ppl_reward/std": 1.9794387817382812, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.9686262564727386, "grad_norm": 1.5660316944122314, "kl": 2.3798828125, "learning_rate": 8.481921733119286e-06, "loss": 0.1127, "num_tokens": 44223555.0, "reward": -0.87451171875, "reward_std": 0.6778543591499329, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.5771484375, "rewards/ppl_reward/std": 3.2385852336883545, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 152.328125, "completions/mean_terminated_length": 152.328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.969844654279622, "grad_norm": 1.893506407737732, "kl": 5.65625, "learning_rate": 8.473507157811254e-06, "loss": 0.1715, "num_tokens": 44240936.0, "reward": -0.326904296875, "reward_std": 0.5340487957000732, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.23193359375, "rewards/ppl_reward/std": 1.4572705030441284, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.9710630520865062, "grad_norm": 1.84852933883667, "kl": 5.00390625, "learning_rate": 8.465093688978619e-06, "loss": 0.1491, "num_tokens": 44256684.0, "reward": -1.03216552734375, "reward_std": 0.49325695633888245, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.7205810546875, "rewards/ppl_reward/std": 3.1048033237457275, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 135.984375, "completions/mean_terminated_length": 135.984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.9722814498933903, "grad_norm": 1.4016155004501343, "kl": 3.033203125, "learning_rate": 8.456681332719856e-06, "loss": 0.0024, "num_tokens": 44271971.0, "reward": -0.2530517578125, "reward_std": 0.5358761548995972, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.217041015625, "rewards/ppl_reward/std": 1.5509397983551025, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 137.515625, "completions/mean_terminated_length": 137.515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.973499847700274, "grad_norm": 2.2655959129333496, "kl": 4.306640625, "learning_rate": 8.448270095132652e-06, "loss": 0.1716, "num_tokens": 44287780.0, "reward": -2.6402587890625, "reward_std": 0.8176959753036499, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.007080078125, "rewards/ppl_reward/std": 7.130382537841797, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17567719519138336, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 137.984375, "completions/mean_terminated_length": 137.984375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.974718245507158, "grad_norm": 2.072575092315674, "kl": 5.0625, "learning_rate": 8.439859982313873e-06, "loss": 0.2367, "num_tokens": 44303107.0, "reward": -1.03533935546875, "reward_std": 0.655427873134613, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.8206787109375, "rewards/ppl_reward/std": 2.416152000427246, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 139.90625, "completions/mean_terminated_length": 139.90625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.975936643314042, "grad_norm": 1.834619164466858, "kl": 3.57421875, "learning_rate": 8.431451000359575e-06, "loss": 0.0459, "num_tokens": 44319093.0, "reward": -2.1318359375, "reward_std": 1.1101953983306885, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.904296875, "rewards/ppl_reward/std": 7.7253570556640625, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 2.977155041120926, "grad_norm": 2.1125621795654297, "kl": 5.97265625, "learning_rate": 8.423043155364994e-06, "loss": 0.1548, "num_tokens": 44336865.0, "reward": -0.9932861328125, "reward_std": 0.8466163873672485, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.541259765625, "rewards/ppl_reward/std": 3.4465904235839844, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 146.359375, "completions/mean_terminated_length": 146.359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 2.97837343892781, "grad_norm": 1.5833771228790283, "kl": 2.8212890625, "learning_rate": 8.414636453424535e-06, "loss": 0.0665, "num_tokens": 44353624.0, "reward": -1.33154296875, "reward_std": 1.0471049547195435, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.3583984375, "rewards/ppl_reward/std": 5.0937819480896, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 127.28125, "completions/mean_terminated_length": 127.28125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.979591836734694, "grad_norm": 2.1441633701324463, "kl": 5.5390625, "learning_rate": 8.406230900631784e-06, "loss": 0.1345, "num_tokens": 44368714.0, "reward": -0.2781982421875, "reward_std": 0.885021448135376, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.095458984375, "rewards/ppl_reward/std": 1.7451751232147217, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 140.265625, "completions/mean_terminated_length": 140.265625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.9808102345415777, "grad_norm": 1.7413859367370605, "kl": 3.001953125, "learning_rate": 8.397826503079489e-06, "loss": 0.0243, "num_tokens": 44384891.0, "reward": -1.285888671875, "reward_std": 0.5890042185783386, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.32177734375, "rewards/ppl_reward/std": 2.4467697143554688, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 144.484375, "completions/mean_terminated_length": 144.484375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.9820286323484617, "grad_norm": 1.3212342262268066, "kl": 1.7763671875, "learning_rate": 8.389423266859564e-06, "loss": -0.0075, "num_tokens": 44401234.0, "reward": -0.7451171875, "reward_std": 0.4576528072357178, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.333984375, "rewards/ppl_reward/std": 3.019866704940796, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 137.984375, "completions/mean_terminated_length": 137.984375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 2.9832470301553458, "grad_norm": 4.338991641998291, "kl": 8.08984375, "learning_rate": 8.381021198063073e-06, "loss": 0.2703, "num_tokens": 44416369.0, "reward": -1.0323486328125, "reward_std": 1.3565293550491333, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.502197265625, "rewards/ppl_reward/std": 3.5355799198150635, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.24193336069583893, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 141.703125, "completions/mean_terminated_length": 141.703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 2.98446542796223, "grad_norm": 3.7322380542755127, "kl": 5.9140625, "learning_rate": 8.37262030278024e-06, "loss": 0.2936, "num_tokens": 44432990.0, "reward": -1.50115966796875, "reward_std": 0.6981683969497681, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.6507568359375, "rewards/ppl_reward/std": 7.558199882507324, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 153.90625, "completions/mean_terminated_length": 153.90625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 2.9856838257691134, "grad_norm": 2.567166328430176, "kl": 4.458984375, "learning_rate": 8.364220587100442e-06, "loss": 0.1266, "num_tokens": 44449840.0, "reward": -1.645751953125, "reward_std": 0.8800044059753418, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.92431640625, "rewards/ppl_reward/std": 4.957531452178955, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 150.4375, "completions/mean_terminated_length": 150.4375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.9869022235759974, "grad_norm": 1.4829912185668945, "kl": 3.591796875, "learning_rate": 8.355822057112192e-06, "loss": 0.0879, "num_tokens": 44466220.0, "reward": -0.9285888671875, "reward_std": 0.7415907979011536, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.513427734375, "rewards/ppl_reward/std": 3.259364366531372, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 138.78125, "completions/mean_terminated_length": 138.78125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.9881206213828815, "grad_norm": 1.2878185510635376, "kl": 2.318359375, "learning_rate": 8.347424718903152e-06, "loss": 0.0292, "num_tokens": 44481678.0, "reward": -1.48828125, "reward_std": 0.6059063673019409, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.7265625, "rewards/ppl_reward/std": 5.789213180541992, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 140.109375, "completions/mean_terminated_length": 140.109375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.9893390191897655, "grad_norm": 2.2763073444366455, "kl": 3.40625, "learning_rate": 8.339028578560114e-06, "loss": 0.177, "num_tokens": 44496957.0, "reward": -1.945556640625, "reward_std": 2.4351940155029297, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.60205078125, "rewards/ppl_reward/std": 12.521173477172852, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.9905574169966496, "grad_norm": 1.7024637460708618, "kl": 4.5546875, "learning_rate": 8.330633642169004e-06, "loss": 0.2041, "num_tokens": 44513865.0, "reward": -0.73199462890625, "reward_std": 0.5589797496795654, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.1280517578125, "rewards/ppl_reward/std": 2.885885715484619, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.9917758148035336, "grad_norm": 1.5248838663101196, "kl": 3.32421875, "learning_rate": 8.32223991581488e-06, "loss": 0.1074, "num_tokens": 44529983.0, "reward": -4.85345458984375, "reward_std": 0.7908927202224731, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -13.3944091796875, "rewards/ppl_reward/std": 18.36359214782715, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 168.015625, "completions/mean_terminated_length": 168.015625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.992994212610417, "grad_norm": 1.8417022228240967, "kl": 3.5234375, "learning_rate": 8.313847405581917e-06, "loss": 0.1344, "num_tokens": 44548200.0, "reward": -1.5291748046875, "reward_std": 0.5455106496810913, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.847412109375, "rewards/ppl_reward/std": 3.480722188949585, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 166.734375, "completions/mean_terminated_length": 166.734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.9942126104173012, "grad_norm": 1.8941011428833008, "kl": 3.978515625, "learning_rate": 8.305456117553403e-06, "loss": 0.115, "num_tokens": 44566871.0, "reward": -0.15289306640625, "reward_std": 0.5176593065261841, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.0167236328125, "rewards/ppl_reward/std": 1.7714098691940308, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1846257895231247, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 161.3125, "completions/mean_terminated_length": 161.3125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.9954310082241853, "grad_norm": 1.758811593055725, "kl": 5.654296875, "learning_rate": 8.29706605781175e-06, "loss": 0.2497, "num_tokens": 44584651.0, "reward": -1.25830078125, "reward_std": 0.7813698053359985, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.2275390625, "rewards/ppl_reward/std": 3.717771291732788, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 153.453125, "completions/mean_terminated_length": 153.453125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.996649406031069, "grad_norm": 1.6535089015960693, "kl": 3.798828125, "learning_rate": 8.288677232438481e-06, "loss": 0.249, "num_tokens": 44602000.0, "reward": -0.710693359375, "reward_std": 0.29055994749069214, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.22607421875, "rewards/ppl_reward/std": 1.5568970441818237, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.997867803837953, "grad_norm": 2.2607879638671875, "kl": 4.71484375, "learning_rate": 8.280289647514215e-06, "loss": 0.1401, "num_tokens": 44618936.0, "reward": -0.676025390625, "reward_std": 0.5392867922782898, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.95361328125, "rewards/ppl_reward/std": 3.1433093547821045, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 136.43243408203125, "completions/mean_terminated_length": 136.43243408203125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.999086201644837, "grad_norm": 1.4146506786346436, "kl": 2.224609375, "learning_rate": 8.27190330911868e-06, "loss": 0.0543, "num_tokens": 44635529.0, "reward": -2.4749755859375, "reward_std": 2.5432052612304688, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.715576171875, "rewards/ppl_reward/std": 15.991893768310547, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 161.3125, "completions/mean_terminated_length": 161.3125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.001218397806884, "grad_norm": 2.0858664512634277, "kl": 4.5078125, "learning_rate": 8.263518223330698e-06, "loss": 0.1895, "num_tokens": 44653557.0, "reward": -0.6126708984375, "reward_std": 0.4744434952735901, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.928466796875, "rewards/ppl_reward/std": 2.818650722503662, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.002436795613768, "grad_norm": 2.0059337615966797, "kl": 5.80859375, "learning_rate": 8.255134396228177e-06, "loss": 0.1575, "num_tokens": 44670309.0, "reward": -0.8970947265625, "reward_std": 0.6935896873474121, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.333251953125, "rewards/ppl_reward/std": 3.007087230682373, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 151.765625, "completions/mean_terminated_length": 151.765625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.0036551934206517, "grad_norm": 1.8437693119049072, "kl": 4.89453125, "learning_rate": 8.246751833888122e-06, "loss": 0.2353, "num_tokens": 44686726.0, "reward": -0.9964599609375, "reward_std": 0.8635796308517456, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.602294921875, "rewards/ppl_reward/std": 4.076677322387695, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 145.78125, "completions/mean_terminated_length": 145.78125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.0048735912275357, "grad_norm": 1.5176746845245361, "kl": 3.26953125, "learning_rate": 8.23837054238662e-06, "loss": 0.0816, "num_tokens": 44702760.0, "reward": -1.0906982421875, "reward_std": 0.669249951839447, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.907958984375, "rewards/ppl_reward/std": 4.827428817749023, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 153.78125, "completions/mean_terminated_length": 153.78125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.0060919890344198, "grad_norm": 1.3992054462432861, "kl": 4.38671875, "learning_rate": 8.229990527798828e-06, "loss": 0.1667, "num_tokens": 44719658.0, "reward": -3.532958984375, "reward_std": 1.5832034349441528, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.77685546875, "rewards/ppl_reward/std": 9.596551895141602, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 140.203125, "completions/mean_terminated_length": 140.203125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.007310386841304, "grad_norm": 1.554325819015503, "kl": 3.2021484375, "learning_rate": 8.221611796198984e-06, "loss": 0.0525, "num_tokens": 44735511.0, "reward": -0.982666015625, "reward_std": 1.0134373903274536, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.74658203125, "rewards/ppl_reward/std": 5.136945724487305, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 3.008528784648188, "grad_norm": 1.8721171617507935, "kl": 3.73828125, "learning_rate": 8.2132343536604e-06, "loss": 0.2223, "num_tokens": 44752223.0, "reward": -1.12127685546875, "reward_std": 0.6429640650749207, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.9144287109375, "rewards/ppl_reward/std": 3.6207430362701416, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 143.921875, "completions/mean_terminated_length": 143.921875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 3.0097471824550714, "grad_norm": 1.3061432838439941, "kl": 3.033203125, "learning_rate": 8.204858206255443e-06, "loss": -0.0368, "num_tokens": 44768570.0, "reward": -1.6458740234375, "reward_std": 1.368116855621338, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.807373046875, "rewards/ppl_reward/std": 4.334970474243164, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 158.359375, "completions/mean_terminated_length": 158.359375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.0109655802619555, "grad_norm": 2.1521782875061035, "kl": 6.21875, "learning_rate": 8.196483360055551e-06, "loss": 0.1689, "num_tokens": 44785969.0, "reward": -1.06103515625, "reward_std": 0.624524712562561, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.5986328125, "rewards/ppl_reward/std": 2.755829095840454, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1763816624879837, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 140.71875, "completions/mean_terminated_length": 140.71875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.0121839780688395, "grad_norm": 1.9221858978271484, "kl": 6.12109375, "learning_rate": 8.188109821131217e-06, "loss": 0.2045, "num_tokens": 44801447.0, "reward": -1.0850830078125, "reward_std": 1.20074462890625, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.646728515625, "rewards/ppl_reward/std": 4.587850093841553, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 142.96875, "completions/mean_terminated_length": 142.96875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.0134023758757236, "grad_norm": 3.6373116970062256, "kl": 7.78125, "learning_rate": 8.179737595551979e-06, "loss": 0.3408, "num_tokens": 44817501.0, "reward": -0.2689208984375, "reward_std": 0.8272190093994141, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -3.944091796875, "rewards/ppl_reward/std": 2.5979063510894775, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20653989911079407, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 140.484375, "completions/mean_terminated_length": 140.484375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.014620773682607, "grad_norm": 2.2465431690216064, "kl": 4.7890625, "learning_rate": 8.171366689386433e-06, "loss": 0.1414, "num_tokens": 44833188.0, "reward": -7.143310546875, "reward_std": 3.4045376777648926, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -17.78662109375, "rewards/ppl_reward/std": 28.9711856842041, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.25, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.015839171489491, "grad_norm": 1.7194082736968994, "kl": 5.0546875, "learning_rate": 8.162997108702215e-06, "loss": 0.1561, "num_tokens": 44850084.0, "reward": -2.9522705078125, "reward_std": 1.1203551292419434, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.490478515625, "rewards/ppl_reward/std": 8.983149528503418, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 127.8730239868164, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.0170575692963753, "grad_norm": 5.9160661697387695, "kl": 8.546875, "learning_rate": 8.154628859565995e-06, "loss": 0.275, "num_tokens": 44865628.0, "reward": -8.7255859375, "reward_std": 10.306171417236328, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -20.998046875, "rewards/ppl_reward/std": 58.78624725341797, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 136.234375, "completions/mean_terminated_length": 136.234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.0182759671032593, "grad_norm": 1.580196738243103, "kl": 4.4609375, "learning_rate": 8.146261948043485e-06, "loss": 0.0709, "num_tokens": 44881835.0, "reward": -0.639404296875, "reward_std": 1.1203316450119019, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -4.70849609375, "rewards/ppl_reward/std": 3.0762758255004883, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2052978277206421, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 138.59375, "completions/mean_terminated_length": 138.59375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.0194943649101433, "grad_norm": 1.9490094184875488, "kl": 2.626953125, "learning_rate": 8.137896380199422e-06, "loss": 0.0491, "num_tokens": 44897857.0, "reward": -2.2568359375, "reward_std": 1.0193992853164673, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -8.255859375, "rewards/ppl_reward/std": 6.078309535980225, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 152.796875, "completions/mean_terminated_length": 152.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.020712762717027, "grad_norm": 3.5527868270874023, "kl": 6.94140625, "learning_rate": 8.129532162097576e-06, "loss": 0.1773, "num_tokens": 44915068.0, "reward": -1.4764404296875, "reward_std": 1.0037906169891357, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.327880859375, "rewards/ppl_reward/std": 2.8391709327697754, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 150.53125, "completions/mean_terminated_length": 150.53125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.021931160523911, "grad_norm": 1.5467127561569214, "kl": 2.98828125, "learning_rate": 8.121169299800732e-06, "loss": -0.0351, "num_tokens": 44932990.0, "reward": -2.50848388671875, "reward_std": 2.2394003868103027, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.5872802734375, "rewards/ppl_reward/std": 9.96707820892334, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 139.828125, "completions/mean_terminated_length": 139.828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.023149558330795, "grad_norm": 2.3144779205322266, "kl": 4.00390625, "learning_rate": 8.112807799370692e-06, "loss": 0.0888, "num_tokens": 44948475.0, "reward": -1.58038330078125, "reward_std": 1.0514824390411377, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.8560791015625, "rewards/ppl_reward/std": 8.700212478637695, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 124.84375, "completions/mean_terminated_length": 124.84375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.024367956137679, "grad_norm": 1.8232113122940063, "kl": 3.7734375, "learning_rate": 8.10444766686828e-06, "loss": 0.1185, "num_tokens": 44963137.0, "reward": -0.76580810546875, "reward_std": 0.4159989058971405, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.2738037109375, "rewards/ppl_reward/std": 3.519803762435913, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 142.78125, "completions/mean_terminated_length": 142.78125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.025586353944563, "grad_norm": 1.8281776905059814, "kl": 2.6328125, "learning_rate": 8.096088908353316e-06, "loss": 0.0157, "num_tokens": 44978651.0, "reward": -1.815185546875, "reward_std": 0.8947359919548035, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.36474609375, "rewards/ppl_reward/std": 7.079478740692139, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 146.5625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.0268047517514467, "grad_norm": 2.1968448162078857, "kl": 5.1484375, "learning_rate": 8.087731529884635e-06, "loss": 0.1959, "num_tokens": 44995231.0, "reward": -3.81591796875, "reward_std": 0.9753211736679077, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -11.1474609375, "rewards/ppl_reward/std": 10.982832908630371, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 142.109375, "completions/mean_terminated_length": 142.109375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.0280231495583307, "grad_norm": 1.3992385864257812, "kl": 3.9609375, "learning_rate": 8.079375537520062e-06, "loss": 0.0746, "num_tokens": 45011286.0, "reward": -1.10302734375, "reward_std": 1.0867581367492676, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.7763671875, "rewards/ppl_reward/std": 4.247224807739258, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2076999396085739, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 160.53125, "completions/mean_terminated_length": 160.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.029241547365215, "grad_norm": 2.339560031890869, "kl": 2.48828125, "learning_rate": 8.071020937316423e-06, "loss": 0.0316, "num_tokens": 45029040.0, "reward": -1.838623046875, "reward_std": 0.8152008056640625, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.37255859375, "rewards/ppl_reward/std": 4.5327324867248535, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 136.234375, "completions/mean_terminated_length": 136.234375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.030459945172099, "grad_norm": 1.9405596256256104, "kl": 2.298828125, "learning_rate": 8.062667735329533e-06, "loss": -0.0208, "num_tokens": 45044175.0, "reward": -2.8609619140625, "reward_std": 1.4743266105651855, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.378173828125, "rewards/ppl_reward/std": 7.590912342071533, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 154.109375, "completions/mean_terminated_length": 154.109375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.0316783429789824, "grad_norm": 2.3078725337982178, "kl": 1.919921875, "learning_rate": 8.0543159376142e-06, "loss": 0.048, "num_tokens": 45060990.0, "reward": -0.9478759765625, "reward_std": 0.33055174350738525, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.762939453125, "rewards/ppl_reward/std": 1.978225827217102, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 154.421875, "completions/mean_terminated_length": 154.421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.0328967407858665, "grad_norm": 1.4436253309249878, "kl": 2.3203125, "learning_rate": 8.045965550224201e-06, "loss": -0.0062, "num_tokens": 45077761.0, "reward": -1.5010986328125, "reward_std": 0.9899603128433228, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.760009765625, "rewards/ppl_reward/std": 3.4338021278381348, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 160.703125, "completions/mean_terminated_length": 160.703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.0341151385927505, "grad_norm": 3.150387763977051, "kl": 3.732421875, "learning_rate": 8.037616579212301e-06, "loss": 0.1414, "num_tokens": 45095270.0, "reward": -2.292724609375, "reward_std": 1.0505741834640503, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.24951171875, "rewards/ppl_reward/std": 5.769213676452637, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 159.171875, "completions/mean_terminated_length": 159.171875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.0353335363996345, "grad_norm": 1.5613245964050293, "kl": 2.5546875, "learning_rate": 8.029269030630237e-06, "loss": 0.1356, "num_tokens": 45113041.0, "reward": -0.4488525390625, "reward_std": 0.4319162964820862, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.561767578125, "rewards/ppl_reward/std": 1.604061484336853, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 154.859375, "completions/mean_terminated_length": 154.859375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.0365519342065186, "grad_norm": 1.5509438514709473, "kl": 4.685546875, "learning_rate": 8.020922910528717e-06, "loss": 0.195, "num_tokens": 45129840.0, "reward": -1.1422119140625, "reward_std": 0.5663852691650391, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.018798828125, "rewards/ppl_reward/std": 2.669940710067749, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 155.46875, "completions/mean_terminated_length": 155.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.037770332013402, "grad_norm": 1.507186770439148, "kl": 4.9765625, "learning_rate": 8.0125782249574e-06, "loss": 0.1687, "num_tokens": 45146774.0, "reward": -1.2701416015625, "reward_std": 0.6366376876831055, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.040283203125, "rewards/ppl_reward/std": 4.6572585105896, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 150.65625, "completions/mean_terminated_length": 150.65625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.0389887298202862, "grad_norm": 1.4476603269577026, "kl": 4.490234375, "learning_rate": 8.004234979964922e-06, "loss": 0.1098, "num_tokens": 45163536.0, "reward": -0.43121337890625, "reward_std": 0.6219042539596558, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.4171142578125, "rewards/ppl_reward/std": 2.0092897415161133, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 148.78125, "completions/mean_terminated_length": 148.78125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.0402071276271703, "grad_norm": 1.5786265134811401, "kl": 2.6162109375, "learning_rate": 7.995893181598866e-06, "loss": 0.08, "num_tokens": 45180082.0, "reward": -0.912353515625, "reward_std": 0.4988289475440979, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.66845703125, "rewards/ppl_reward/std": 3.03020977973938, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 156.921875, "completions/mean_terminated_length": 156.921875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.0414255254340543, "grad_norm": 1.7587599754333496, "kl": 3.73046875, "learning_rate": 7.98755283590577e-06, "loss": 0.0717, "num_tokens": 45197333.0, "reward": -1.1072998046875, "reward_std": 0.5411865711212158, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.058349609375, "rewards/ppl_reward/std": 6.570977687835693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 3.0426439232409384, "grad_norm": 1.525164008140564, "kl": 2.3076171875, "learning_rate": 7.979213948931118e-06, "loss": 0.0989, "num_tokens": 45213793.0, "reward": -0.0826416015625, "reward_std": 0.21227499842643738, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -4.071533203125, "rewards/ppl_reward/std": 1.440896987915039, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 154.453125, "completions/mean_terminated_length": 154.453125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.043862321047822, "grad_norm": 2.7746148109436035, "kl": 4.134765625, "learning_rate": 7.970876526719333e-06, "loss": 0.1443, "num_tokens": 45231070.0, "reward": -2.4249267578125, "reward_std": 0.5451978445053101, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.584228515625, "rewards/ppl_reward/std": 9.920768737792969, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 145.765625, "completions/mean_terminated_length": 145.765625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.045080718854706, "grad_norm": 1.6660709381103516, "kl": 3.5302734375, "learning_rate": 7.962540575313782e-06, "loss": 0.0742, "num_tokens": 45248223.0, "reward": -1.932861328125, "reward_std": 0.9017876982688904, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.61572265625, "rewards/ppl_reward/std": 5.483793258666992, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 142.09375, "completions/mean_terminated_length": 142.09375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.04629911666159, "grad_norm": 2.407040596008301, "kl": 6.650390625, "learning_rate": 7.954206100756763e-06, "loss": 0.2224, "num_tokens": 45263973.0, "reward": -1.6083984375, "reward_std": 0.8239640593528748, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.794921875, "rewards/ppl_reward/std": 4.132216930389404, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 145.984375, "completions/mean_terminated_length": 145.984375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.047517514468474, "grad_norm": 3.515179395675659, "kl": 7.875, "learning_rate": 7.945873109089503e-06, "loss": 0.2951, "num_tokens": 45280324.0, "reward": -1.2049560546875, "reward_std": 1.2123504877090454, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.011474609375, "rewards/ppl_reward/std": 3.1229324340820312, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1767328530550003, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 137.09375, "completions/mean_terminated_length": 137.09375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.048735912275358, "grad_norm": 1.295095682144165, "kl": 2.630859375, "learning_rate": 7.937541606352158e-06, "loss": -0.0219, "num_tokens": 45296338.0, "reward": -1.3050537109375, "reward_std": 0.6618903279304504, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.227294921875, "rewards/ppl_reward/std": 4.107434272766113, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 147.921875, "completions/mean_terminated_length": 147.921875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.0499543100822417, "grad_norm": 1.3888171911239624, "kl": 3.611328125, "learning_rate": 7.929211598583795e-06, "loss": 0.1856, "num_tokens": 45312717.0, "reward": -2.7247314453125, "reward_std": 0.4906569719314575, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.277587890625, "rewards/ppl_reward/std": 5.306159019470215, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 141.921875, "completions/mean_terminated_length": 141.921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.0511727078891258, "grad_norm": 1.7880836725234985, "kl": 5.1572265625, "learning_rate": 7.92088309182241e-06, "loss": 0.1803, "num_tokens": 45328712.0, "reward": -0.75927734375, "reward_std": 0.5953707098960876, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.1748046875, "rewards/ppl_reward/std": 2.70184063911438, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 137.78125, "completions/mean_terminated_length": 137.78125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.05239110569601, "grad_norm": 2.785574436187744, "kl": 7.3046875, "learning_rate": 7.9125560921049e-06, "loss": 0.2462, "num_tokens": 45344090.0, "reward": -0.408447265625, "reward_std": 1.1590861082077026, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.27783203125, "rewards/ppl_reward/std": 3.3830747604370117, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20918723940849304, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 137.9375, "completions/mean_terminated_length": 137.9375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 3.053609503502894, "grad_norm": 1.868531346321106, "kl": 4.345703125, "learning_rate": 7.904230605467079e-06, "loss": 0.0846, "num_tokens": 45359582.0, "reward": -0.6212158203125, "reward_std": 0.5992865562438965, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.906494140625, "rewards/ppl_reward/std": 1.8776116371154785, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 143.171875, "completions/mean_terminated_length": 143.171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.0548279013097774, "grad_norm": 1.9707484245300293, "kl": 5.015625, "learning_rate": 7.89590663794366e-06, "loss": 0.1395, "num_tokens": 45375769.0, "reward": -2.08935546875, "reward_std": 1.47694730758667, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.6787109375, "rewards/ppl_reward/std": 6.622119903564453, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.0560462991166615, "grad_norm": 2.172727108001709, "kl": 7.48046875, "learning_rate": 7.887584195568252e-06, "loss": 0.3015, "num_tokens": 45393105.0, "reward": -2.7957763671875, "reward_std": 0.9916830062866211, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.154052734375, "rewards/ppl_reward/std": 7.18112325668335, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 159.234375, "completions/mean_terminated_length": 159.234375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.0572646969235455, "grad_norm": 2.405397415161133, "kl": 4.5458984375, "learning_rate": 7.879263284373363e-06, "loss": 0.3228, "num_tokens": 45410424.0, "reward": -0.55908203125, "reward_std": 0.2642999291419983, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.9931640625, "rewards/ppl_reward/std": 1.5644776821136475, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 150.984375, "completions/mean_terminated_length": 150.984375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 3.0584830947304296, "grad_norm": 2.004310369491577, "kl": 2.4423828125, "learning_rate": 7.870943910390392e-06, "loss": 0.1068, "num_tokens": 45428111.0, "reward": -1.2236328125, "reward_std": 0.2933429777622223, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.369140625, "rewards/ppl_reward/std": 4.687215328216553, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 142.34375, "completions/mean_terminated_length": 142.34375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.0597014925373136, "grad_norm": 1.7627055644989014, "kl": 4.51171875, "learning_rate": 7.862626079649618e-06, "loss": 0.1103, "num_tokens": 45444573.0, "reward": -5.43408203125, "reward_std": 5.778337478637695, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -14.5244140625, "rewards/ppl_reward/std": 33.86238479614258, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 139.09375, "completions/mean_terminated_length": 139.09375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.060919890344197, "grad_norm": 2.9297568798065186, "kl": 6.0, "learning_rate": 7.854309798180202e-06, "loss": 0.2063, "num_tokens": 45461035.0, "reward": -0.5604248046875, "reward_std": 0.5909650325775146, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.714599609375, "rewards/ppl_reward/std": 1.4650019407272339, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 153.671875, "completions/mean_terminated_length": 153.671875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.0621382881510812, "grad_norm": 2.510063648223877, "kl": 3.66015625, "learning_rate": 7.845995072010188e-06, "loss": 0.1315, "num_tokens": 45478046.0, "reward": -0.1943359375, "reward_std": 0.5312103629112244, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.193359375, "rewards/ppl_reward/std": 2.284552574157715, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 140.984375, "completions/mean_terminated_length": 140.984375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.0633566859579653, "grad_norm": 3.8008460998535156, "kl": 7.1953125, "learning_rate": 7.83768190716649e-06, "loss": 0.2212, "num_tokens": 45493581.0, "reward": -1.2132568359375, "reward_std": 0.6822183728218079, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.793701171875, "rewards/ppl_reward/std": 2.754145860671997, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.16699187457561493, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 160.34375, "completions/mean_terminated_length": 160.34375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.0645750837648493, "grad_norm": 1.3709619045257568, "kl": 5.060546875, "learning_rate": 7.829370309674886e-06, "loss": 0.2057, "num_tokens": 45511931.0, "reward": -1.6898193359375, "reward_std": 0.7472479343414307, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.965576171875, "rewards/ppl_reward/std": 4.230887413024902, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18496133387088776, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 140.03125, "completions/mean_terminated_length": 140.03125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.0657934815717334, "grad_norm": 1.6797072887420654, "kl": 5.4765625, "learning_rate": 7.821060285560024e-06, "loss": 0.2217, "num_tokens": 45527533.0, "reward": -1.2489013671875, "reward_std": 1.232875108718872, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.075927734375, "rewards/ppl_reward/std": 5.126326084136963, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14412261545658112, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 146.609375, "completions/mean_terminated_length": 132.6825408935547, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.067011879378617, "grad_norm": 4.324069499969482, "kl": 10.3203125, "learning_rate": 7.81275184084541e-06, "loss": 0.4595, "num_tokens": 45543020.0, "reward": -2.5328369140625, "reward_std": 2.2966511249542236, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.393798828125, "rewards/ppl_reward/std": 8.256896018981934, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.25865477323532104, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.068230277185501, "grad_norm": 1.350854754447937, "kl": 4.13671875, "learning_rate": 7.8044449815534e-06, "loss": 0.049, "num_tokens": 45558548.0, "reward": -2.9229736328125, "reward_std": 1.173764705657959, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.494384765625, "rewards/ppl_reward/std": 5.265584468841553, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 143.046875, "completions/mean_terminated_length": 143.046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.069448674992385, "grad_norm": 1.3562390804290771, "kl": 2.6650390625, "learning_rate": 7.796139713705214e-06, "loss": -0.0296, "num_tokens": 45574647.0, "reward": -2.6630859375, "reward_std": 1.282191276550293, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.990234375, "rewards/ppl_reward/std": 7.93948221206665, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 179.046875, "completions/mean_terminated_length": 165.6349334716797, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.070667072799269, "grad_norm": 2.664839029312134, "kl": 5.47265625, "learning_rate": 7.787836043320899e-06, "loss": 0.3064, "num_tokens": 45592546.0, "reward": -6.2564697265625, "reward_std": 2.4437379837036133, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -16.208251953125, "rewards/ppl_reward/std": 23.839879989624023, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 151.328125, "completions/mean_terminated_length": 151.328125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 3.071885470606153, "grad_norm": 3.4001541137695312, "kl": 3.611328125, "learning_rate": 7.779533976419357e-06, "loss": 0.185, "num_tokens": 45608863.0, "reward": -0.2772216796875, "reward_std": 0.5916948318481445, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.257568359375, "rewards/ppl_reward/std": 2.362204074859619, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 143.421875, "completions/mean_terminated_length": 143.421875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.0731038684130367, "grad_norm": 1.5481234788894653, "kl": 3.615234375, "learning_rate": 7.771233519018327e-06, "loss": 0.1029, "num_tokens": 45625042.0, "reward": -0.6954345703125, "reward_std": 0.6242891550064087, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.961181640625, "rewards/ppl_reward/std": 1.833124041557312, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 156.640625, "completions/mean_terminated_length": 156.640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.0743222662199208, "grad_norm": 2.5755486488342285, "kl": 3.22265625, "learning_rate": 7.76293467713438e-06, "loss": 0.0828, "num_tokens": 45641955.0, "reward": -0.44012451171875, "reward_std": 0.44915658235549927, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.6614990234375, "rewards/ppl_reward/std": 2.086247682571411, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.075540664026805, "grad_norm": 2.592432975769043, "kl": 3.91796875, "learning_rate": 7.754637456782911e-06, "loss": 0.1441, "num_tokens": 45658927.0, "reward": -0.20654296875, "reward_std": 0.5299490690231323, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.0458984375, "rewards/ppl_reward/std": 1.7195003032684326, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 151.921875, "completions/mean_terminated_length": 151.921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.076759061833689, "grad_norm": 7.217872142791748, "kl": 5.48046875, "learning_rate": 7.746341863978151e-06, "loss": 0.1842, "num_tokens": 45676458.0, "reward": -0.47705078125, "reward_std": 1.0301810503005981, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.5791015625, "rewards/ppl_reward/std": 3.002077579498291, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 154.96875, "completions/mean_terminated_length": 154.96875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.0779774596405725, "grad_norm": 3.022179365158081, "kl": 3.8046875, "learning_rate": 7.738047904733141e-06, "loss": 0.1384, "num_tokens": 45693472.0, "reward": -1.426025390625, "reward_std": 0.5545752048492432, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.50830078125, "rewards/ppl_reward/std": 2.8708765506744385, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 137.890625, "completions/mean_terminated_length": 137.890625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.0791958574474565, "grad_norm": 2.636439085006714, "kl": 3.87109375, "learning_rate": 7.729755585059741e-06, "loss": 0.1898, "num_tokens": 45708913.0, "reward": -0.68817138671875, "reward_std": 0.5552511215209961, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.0482177734375, "rewards/ppl_reward/std": 3.6276824474334717, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 151.296875, "completions/mean_terminated_length": 151.296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.0804142552543405, "grad_norm": 2.0923612117767334, "kl": 5.6640625, "learning_rate": 7.721464910968628e-06, "loss": 0.1779, "num_tokens": 45725252.0, "reward": -4.047119140625, "reward_std": 2.0758461952209473, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -11.63330078125, "rewards/ppl_reward/std": 16.38131332397461, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 140.34375, "completions/mean_terminated_length": 140.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.0816326530612246, "grad_norm": 2.200164794921875, "kl": 3.4609375, "learning_rate": 7.71317588846927e-06, "loss": 0.1519, "num_tokens": 45740722.0, "reward": -1.4486083984375, "reward_std": 0.694707453250885, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.639404296875, "rewards/ppl_reward/std": 3.3731467723846436, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 138.328125, "completions/mean_terminated_length": 138.328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.0828510508681086, "grad_norm": 2.744699001312256, "kl": 5.45703125, "learning_rate": 7.704888523569959e-06, "loss": 0.1049, "num_tokens": 45756367.0, "reward": -0.6976318359375, "reward_std": 0.9900805950164795, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -4.668701171875, "rewards/ppl_reward/std": 2.056766986846924, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22980836033821106, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 153.734375, "completions/mean_terminated_length": 153.734375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.084069448674992, "grad_norm": 2.686913251876831, "kl": 6.5390625, "learning_rate": 7.69660282227777e-06, "loss": 0.301, "num_tokens": 45773142.0, "reward": -1.334716796875, "reward_std": 1.0155363082885742, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.06005859375, "rewards/ppl_reward/std": 3.1079461574554443, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 167.734375, "completions/mean_terminated_length": 167.734375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.0852878464818763, "grad_norm": 3.6569836139678955, "kl": 8.609375, "learning_rate": 7.688318790598579e-06, "loss": 0.2996, "num_tokens": 45792093.0, "reward": -1.2994384765625, "reward_std": 1.0878691673278809, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.833251953125, "rewards/ppl_reward/std": 3.536789894104004, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2596118450164795, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 142.828125, "completions/mean_terminated_length": 142.828125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.0865062442887603, "grad_norm": 2.6976537704467773, "kl": 8.7265625, "learning_rate": 7.680036434537054e-06, "loss": 0.4242, "num_tokens": 45808218.0, "reward": -2.02392578125, "reward_std": 2.0558888912200928, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -7.2744140625, "rewards/ppl_reward/std": 5.532499313354492, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.21271435916423798, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 124.890625, "completions/mean_terminated_length": 124.890625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.0877246420956443, "grad_norm": 4.115485668182373, "kl": 3.9765625, "learning_rate": 7.671755760096638e-06, "loss": 0.1794, "num_tokens": 45822651.0, "reward": -2.9859619140625, "reward_std": 1.3432871103286743, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.682861328125, "rewards/ppl_reward/std": 7.000710487365723, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 131.640625, "completions/mean_terminated_length": 131.640625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.0889430399025284, "grad_norm": 2.8300201892852783, "kl": 6.33984375, "learning_rate": 7.663476773279569e-06, "loss": 0.214, "num_tokens": 45837892.0, "reward": -1.0267333984375, "reward_std": 0.9166643619537354, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -5.381591796875, "rewards/ppl_reward/std": 1.8696658611297607, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.090161437709412, "grad_norm": 2.45884108543396, "kl": 7.1171875, "learning_rate": 7.655199480086848e-06, "loss": 0.2665, "num_tokens": 45853868.0, "reward": -0.1072998046875, "reward_std": 0.7534191608428955, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -3.456787109375, "rewards/ppl_reward/std": 0.9608482122421265, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2672322392463684, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 133.59375, "completions/mean_terminated_length": 133.59375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.091379835516296, "grad_norm": 1.9280638694763184, "kl": 4.9140625, "learning_rate": 7.646923886518263e-06, "loss": 0.1457, "num_tokens": 45869498.0, "reward": -1.0789794921875, "reward_std": 1.0399847030639648, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.665771484375, "rewards/ppl_reward/std": 1.8562568426132202, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 130.21875, "completions/mean_terminated_length": 130.21875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.09259823332318, "grad_norm": 1.7967417240142822, "kl": 4.9140625, "learning_rate": 7.638649998572353e-06, "loss": 0.1965, "num_tokens": 45884520.0, "reward": -2.7513427734375, "reward_std": 3.1541554927825928, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -8.869873046875, "rewards/ppl_reward/std": 14.758421897888184, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18926911056041718, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 143.234375, "completions/mean_terminated_length": 143.234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.093816631130064, "grad_norm": 2.012895107269287, "kl": 5.6484375, "learning_rate": 7.630377822246438e-06, "loss": 0.2455, "num_tokens": 45900983.0, "reward": -2.34619140625, "reward_std": 1.5029263496398926, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.2861328125, "rewards/ppl_reward/std": 5.496905326843262, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 142.96875, "completions/mean_terminated_length": 128.984130859375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.0950350289369477, "grad_norm": 1.7213131189346313, "kl": 6.0078125, "learning_rate": 7.622107363536581e-06, "loss": 0.261, "num_tokens": 45917117.0, "reward": -2.318603515625, "reward_std": 2.6658122539520264, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.24658203125, "rewards/ppl_reward/std": 11.581572532653809, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 123.8125, "completions/mean_terminated_length": 123.8125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.0962534267438317, "grad_norm": 2.061753273010254, "kl": 4.984375, "learning_rate": 7.613838628437611e-06, "loss": 0.1541, "num_tokens": 45931793.0, "reward": -1.652587890625, "reward_std": 0.8569878339767456, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.91455078125, "rewards/ppl_reward/std": 5.109502792358398, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.097471824550716, "grad_norm": 1.8561701774597168, "kl": 2.232421875, "learning_rate": 7.60557162294311e-06, "loss": 0.0432, "num_tokens": 45947081.0, "reward": -0.94482421875, "reward_std": 0.5549002885818481, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.6865234375, "rewards/ppl_reward/std": 2.0406317710876465, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 136.734375, "completions/mean_terminated_length": 136.734375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.0986902223576, "grad_norm": 1.5195058584213257, "kl": 1.650390625, "learning_rate": 7.597306353045393e-06, "loss": 0.0287, "num_tokens": 45963312.0, "reward": -1.216552734375, "reward_std": 0.34446755051612854, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.27685546875, "rewards/ppl_reward/std": 3.865511417388916, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 142.9375, "completions/mean_terminated_length": 142.9375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.099908620164484, "grad_norm": 1.6408905982971191, "kl": 2.482421875, "learning_rate": 7.5890428247355295e-06, "loss": 0.0698, "num_tokens": 45980084.0, "reward": -0.63330078125, "reward_std": 0.4755898118019104, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.0400390625, "rewards/ppl_reward/std": 1.9209505319595337, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.1011270179713675, "grad_norm": 1.896851897239685, "kl": 4.4453125, "learning_rate": 7.580781044003324e-06, "loss": 0.1553, "num_tokens": 45996620.0, "reward": -4.7723388671875, "reward_std": 2.061790943145752, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -13.099365234375, "rewards/ppl_reward/std": 22.38467025756836, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16810208559036255, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.1023454157782515, "grad_norm": 1.791845440864563, "kl": 5.216796875, "learning_rate": 7.57252101683731e-06, "loss": 0.2448, "num_tokens": 46011132.0, "reward": -1.6541748046875, "reward_std": 1.3521513938903809, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.878662109375, "rewards/ppl_reward/std": 4.555145263671875, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 127.1875, "completions/mean_terminated_length": 127.1875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.1035638135851356, "grad_norm": 1.8871029615402222, "kl": 2.45703125, "learning_rate": 7.5642627492247475e-06, "loss": 0.0517, "num_tokens": 46025656.0, "reward": -3.5755615234375, "reward_std": 2.4195046424865723, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -10.885498046875, "rewards/ppl_reward/std": 19.02334976196289, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 134.328125, "completions/mean_terminated_length": 134.328125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.1047822113920196, "grad_norm": 1.8153884410858154, "kl": 4.173828125, "learning_rate": 7.55600624715163e-06, "loss": 0.1268, "num_tokens": 46041749.0, "reward": -1.547119140625, "reward_std": 1.0558035373687744, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.73486328125, "rewards/ppl_reward/std": 4.583136558532715, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2376670390367508, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 129.109375, "completions/mean_terminated_length": 129.109375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.1060006091989036, "grad_norm": 1.9000234603881836, "kl": 3.470703125, "learning_rate": 7.5477515166026706e-06, "loss": 0.1541, "num_tokens": 46056820.0, "reward": -1.1854248046875, "reward_std": 0.7448173761367798, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.980224609375, "rewards/ppl_reward/std": 3.9790735244750977, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 127.203125, "completions/mean_terminated_length": 127.203125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.1072190070057872, "grad_norm": 1.6932339668273926, "kl": 5.12890625, "learning_rate": 7.5394985635612885e-06, "loss": 0.1707, "num_tokens": 46072201.0, "reward": 0.001220703125, "reward_std": 0.4158262610435486, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -3.65380859375, "rewards/ppl_reward/std": 1.0145894289016724, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 118.09375, "completions/mean_terminated_length": 118.09375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.1084374048126713, "grad_norm": 1.6307471990585327, "kl": 4.45703125, "learning_rate": 7.531247394009626e-06, "loss": 0.1389, "num_tokens": 46086111.0, "reward": -1.73199462890625, "reward_std": 1.6391116380691528, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.1046142578125, "rewards/ppl_reward/std": 6.266976833343506, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 127.11112213134766, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 3.1096558026195553, "grad_norm": 2.3028860092163086, "kl": 9.5703125, "learning_rate": 7.522998013928523e-06, "loss": 0.5176, "num_tokens": 46101951.0, "reward": -1.10205078125, "reward_std": 0.5974118113517761, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.8447265625, "rewards/ppl_reward/std": 3.6898910999298096, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10175786912441254, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 128.984375, "completions/mean_terminated_length": 128.984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.1108742004264394, "grad_norm": 2.3190553188323975, "kl": 6.8671875, "learning_rate": 7.514750429297528e-06, "loss": 0.2339, "num_tokens": 46117254.0, "reward": -1.825927734375, "reward_std": 1.153997778892517, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.15966796875, "rewards/ppl_reward/std": 3.2057082653045654, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2649018466472626, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 126.59375, "completions/mean_terminated_length": 126.59375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.112092598233323, "grad_norm": 2.658358573913574, "kl": 6.0029296875, "learning_rate": 7.506504646094893e-06, "loss": 0.1871, "num_tokens": 46132452.0, "reward": -1.039794921875, "reward_std": 0.6861530542373657, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.67333984375, "rewards/ppl_reward/std": 1.7334747314453125, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.113310996040207, "grad_norm": 1.787745714187622, "kl": 6.15625, "learning_rate": 7.4982606702975505e-06, "loss": 0.3161, "num_tokens": 46148964.0, "reward": -2.31732177734375, "reward_std": 0.6670494675636292, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.2518310546875, "rewards/ppl_reward/std": 5.661502361297607, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14211894571781158, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 118.828125, "completions/mean_terminated_length": 118.828125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.114529393847091, "grad_norm": 2.587344169616699, "kl": 3.908203125, "learning_rate": 7.490018507881129e-06, "loss": 0.1767, "num_tokens": 46162777.0, "reward": -3.0595703125, "reward_std": 0.9106003046035767, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.814453125, "rewards/ppl_reward/std": 6.149753093719482, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.115747791653975, "grad_norm": 2.4942212104797363, "kl": 3.021484375, "learning_rate": 7.481778164819948e-06, "loss": 0.059, "num_tokens": 46178645.0, "reward": -1.9620361328125, "reward_std": 1.0593715906143188, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.728759765625, "rewards/ppl_reward/std": 4.2571187019348145, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 133.171875, "completions/mean_terminated_length": 133.171875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.116966189460859, "grad_norm": 1.8347764015197754, "kl": 7.029296875, "learning_rate": 7.473539647087007e-06, "loss": 0.303, "num_tokens": 46194848.0, "reward": -2.121826171875, "reward_std": 1.3599052429199219, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.72802734375, "rewards/ppl_reward/std": 6.496512413024902, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23239074647426605, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 134.34375, "completions/mean_terminated_length": 134.34375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 3.1181845872677427, "grad_norm": 2.871893882751465, "kl": 7.5, "learning_rate": 7.465302960653975e-06, "loss": 0.4227, "num_tokens": 46210430.0, "reward": -1.0694580078125, "reward_std": 0.9251581430435181, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.670166015625, "rewards/ppl_reward/std": 3.6575591564178467, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16060402989387512, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 135.765625, "completions/mean_terminated_length": 135.765625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 3.1194029850746268, "grad_norm": 1.7316415309906006, "kl": 5.0146484375, "learning_rate": 7.457068111491199e-06, "loss": 0.2712, "num_tokens": 46226151.0, "reward": -0.382568359375, "reward_std": 0.4979429244995117, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.47607421875, "rewards/ppl_reward/std": 1.5026121139526367, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 147.703125, "completions/mean_terminated_length": 147.703125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.120621382881511, "grad_norm": 2.897047996520996, "kl": 6.85546875, "learning_rate": 7.448835105567695e-06, "loss": 0.253, "num_tokens": 46243508.0, "reward": -0.55438232421875, "reward_std": 0.9011848568916321, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.5853271484375, "rewards/ppl_reward/std": 2.456943988800049, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2025614231824875, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 129.421875, "completions/mean_terminated_length": 129.421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.121839780688395, "grad_norm": 1.6358357667922974, "kl": 2.2958984375, "learning_rate": 7.440603948851142e-06, "loss": 0.0385, "num_tokens": 46258943.0, "reward": -0.9605712890625, "reward_std": 0.40353262424468994, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.710205078125, "rewards/ppl_reward/std": 2.868438959121704, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 120.765625, "completions/mean_terminated_length": 120.765625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.123058178495279, "grad_norm": 2.068902015686035, "kl": 8.171875, "learning_rate": 7.432374647307878e-06, "loss": 0.29, "num_tokens": 46273520.0, "reward": -3.34375, "reward_std": 1.4326825141906738, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -10.125, "rewards/ppl_reward/std": 10.022062301635742, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.1242765763021625, "grad_norm": 2.0884149074554443, "kl": 3.6640625, "learning_rate": 7.4241472069028915e-06, "loss": 0.1888, "num_tokens": 46289760.0, "reward": -1.1416015625, "reward_std": 0.5388908386230469, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.009765625, "rewards/ppl_reward/std": 3.7126972675323486, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 146.53125, "completions/mean_terminated_length": 146.53125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 3.1254949741090465, "grad_norm": 1.828455924987793, "kl": 4.583984375, "learning_rate": 7.4159216335998345e-06, "loss": 0.2668, "num_tokens": 46306938.0, "reward": -2.451904296875, "reward_std": 0.6088034510612488, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.57568359375, "rewards/ppl_reward/std": 5.6127753257751465, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 127.015625, "completions/mean_terminated_length": 127.015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.1267133719159306, "grad_norm": 3.6511807441711426, "kl": 3.748046875, "learning_rate": 7.407697933360989e-06, "loss": 0.1854, "num_tokens": 46321507.0, "reward": -0.87982177734375, "reward_std": 0.5591962933540344, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.4862060546875, "rewards/ppl_reward/std": 3.0843324661254883, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.1279317697228146, "grad_norm": 2.260119676589966, "kl": 5.32421875, "learning_rate": 7.399476112147295e-06, "loss": 0.1348, "num_tokens": 46335739.0, "reward": -1.5196533203125, "reward_std": 0.7304182648658752, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.601806640625, "rewards/ppl_reward/std": 5.5234761238098145, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.1291501675296987, "grad_norm": 2.1921615600585938, "kl": 5.375, "learning_rate": 7.391256175918324e-06, "loss": 0.1899, "num_tokens": 46351435.0, "reward": -3.599853515625, "reward_std": 0.9350149035453796, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -10.76220703125, "rewards/ppl_reward/std": 9.943425178527832, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 121.59375, "completions/mean_terminated_length": 121.59375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 3.1303685653365823, "grad_norm": 2.1725382804870605, "kl": 4.95703125, "learning_rate": 7.3830381306322765e-06, "loss": 0.1901, "num_tokens": 46366273.0, "reward": -0.931884765625, "reward_std": 0.720527172088623, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.45751953125, "rewards/ppl_reward/std": 3.1775012016296387, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.1315869631434663, "grad_norm": 1.9507943391799927, "kl": 2.9912109375, "learning_rate": 7.374821982245991e-06, "loss": 0.0378, "num_tokens": 46381617.0, "reward": -0.5322265625, "reward_std": 0.4452901780605316, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.798828125, "rewards/ppl_reward/std": 2.474104166030884, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 129.296875, "completions/mean_terminated_length": 129.296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.1328053609503503, "grad_norm": 1.4028832912445068, "kl": 3.8935546875, "learning_rate": 7.366607736714922e-06, "loss": 0.0925, "num_tokens": 46396644.0, "reward": -6.29443359375, "reward_std": 6.724118232727051, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -16.1748046875, "rewards/ppl_reward/std": 42.61189270019531, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.1340237587572344, "grad_norm": 1.6160051822662354, "kl": 3.26171875, "learning_rate": 7.358395399993158e-06, "loss": 0.0627, "num_tokens": 46411848.0, "reward": -0.91357421875, "reward_std": 1.003962516784668, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.4990234375, "rewards/ppl_reward/std": 5.328691482543945, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 117.578125, "completions/mean_terminated_length": 117.578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.135242156564118, "grad_norm": 2.1271965503692627, "kl": 4.19921875, "learning_rate": 7.350184978033386e-06, "loss": 0.0604, "num_tokens": 46425685.0, "reward": -0.9423828125, "reward_std": 0.9703872799873352, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.462890625, "rewards/ppl_reward/std": 3.7802891731262207, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 126.78125, "completions/mean_terminated_length": 126.78125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.136460554371002, "grad_norm": 2.7658393383026123, "kl": 4.47265625, "learning_rate": 7.341976476786918e-06, "loss": 0.1331, "num_tokens": 46440887.0, "reward": -3.8843994140625, "reward_std": 0.9670130014419556, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -11.425048828125, "rewards/ppl_reward/std": 13.145614624023438, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 126.21875, "completions/mean_terminated_length": 126.21875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.137678952177886, "grad_norm": 1.5232785940170288, "kl": 4.66015625, "learning_rate": 7.333769902203671e-06, "loss": 0.1104, "num_tokens": 46456133.0, "reward": -1.7977294921875, "reward_std": 1.3473613262176514, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.056396484375, "rewards/ppl_reward/std": 4.650004863739014, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19444002211093903, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 124.671875, "completions/mean_terminated_length": 124.671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.13889734998477, "grad_norm": 1.5864146947860718, "kl": 5.6484375, "learning_rate": 7.325565260232164e-06, "loss": 0.2304, "num_tokens": 46470744.0, "reward": -2.0491943359375, "reward_std": 1.356453776359558, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.598388671875, "rewards/ppl_reward/std": 4.268540382385254, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2083333432674408, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.140115747791654, "grad_norm": 1.536827564239502, "kl": 4.453125, "learning_rate": 7.317362556819516e-06, "loss": 0.0759, "num_tokens": 46485502.0, "reward": -0.8192138671875, "reward_std": 1.323073387145996, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.200927734375, "rewards/ppl_reward/std": 4.7737202644348145, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.1413341455985377, "grad_norm": 1.3605173826217651, "kl": 3.46875, "learning_rate": 7.309161797911442e-06, "loss": 0.0603, "num_tokens": 46502630.0, "reward": -2.14111328125, "reward_std": 0.8464851379394531, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.8369140625, "rewards/ppl_reward/std": 5.443967819213867, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 124.078125, "completions/mean_terminated_length": 124.078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.142552543405422, "grad_norm": 1.5814263820648193, "kl": 2.919921875, "learning_rate": 7.300962989452242e-06, "loss": 0.007, "num_tokens": 46516979.0, "reward": -1.1893310546875, "reward_std": 0.4555569291114807, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.167724609375, "rewards/ppl_reward/std": 4.840409278869629, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 134.9375, "completions/mean_terminated_length": 134.9375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 3.143770941212306, "grad_norm": 1.7867871522903442, "kl": 4.390625, "learning_rate": 7.292766137384815e-06, "loss": 0.1928, "num_tokens": 46532647.0, "reward": -2.0064697265625, "reward_std": 0.8562486171722412, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.692626953125, "rewards/ppl_reward/std": 6.361851215362549, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 139.28125, "completions/mean_terminated_length": 139.28125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.14498933901919, "grad_norm": 1.3629143238067627, "kl": 2.896484375, "learning_rate": 7.284571247650629e-06, "loss": 0.0406, "num_tokens": 46548913.0, "reward": -0.080322265625, "reward_std": 0.46585729718208313, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.93408203125, "rewards/ppl_reward/std": 1.4398716688156128, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 128.453125, "completions/mean_terminated_length": 128.453125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.146207736826074, "grad_norm": 1.670192003250122, "kl": 4.41796875, "learning_rate": 7.276378326189729e-06, "loss": 0.1229, "num_tokens": 46563430.0, "reward": -1.57781982421875, "reward_std": 0.8037194013595581, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.7650146484375, "rewards/ppl_reward/std": 5.0016913414001465, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 137.671875, "completions/mean_terminated_length": 137.671875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.1474261346329575, "grad_norm": 1.5887786149978638, "kl": 3.806640625, "learning_rate": 7.2681873789407435e-06, "loss": 0.0846, "num_tokens": 46579009.0, "reward": -4.3140869140625, "reward_std": 1.9066963195800781, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -12.307861328125, "rewards/ppl_reward/std": 14.093560218811035, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 144.890625, "completions/mean_terminated_length": 144.890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.1486445324398415, "grad_norm": 3.386188268661499, "kl": 6.46875, "learning_rate": 7.2599984118408625e-06, "loss": 0.2253, "num_tokens": 46595666.0, "reward": -2.411376953125, "reward_std": 1.2693204879760742, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.39306640625, "rewards/ppl_reward/std": 5.9268012046813965, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 140.390625, "completions/mean_terminated_length": 140.390625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.1498629302467256, "grad_norm": 2.31123948097229, "kl": 1.9072265625, "learning_rate": 7.251811430825846e-06, "loss": 0.0868, "num_tokens": 46611467.0, "reward": -1.76513671875, "reward_std": 0.3746934235095978, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -7.4833984375, "rewards/ppl_reward/std": 4.647508144378662, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 135.078125, "completions/mean_terminated_length": 135.078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.1510813280536096, "grad_norm": 1.5365813970565796, "kl": 3.693359375, "learning_rate": 7.243626441830009e-06, "loss": 0.0552, "num_tokens": 46626432.0, "reward": -2.9979248046875, "reward_std": 1.4910008907318115, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.698974609375, "rewards/ppl_reward/std": 7.892162322998047, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 141.609375, "completions/mean_terminated_length": 141.609375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.1522997258604937, "grad_norm": 1.617783784866333, "kl": 4.0703125, "learning_rate": 7.235443450786226e-06, "loss": 0.1132, "num_tokens": 46642055.0, "reward": -3.35791015625, "reward_std": 1.2688205242156982, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -10.3095703125, "rewards/ppl_reward/std": 11.96144962310791, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.208927720785141, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 133.609375, "completions/mean_terminated_length": 133.609375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.1535181236673773, "grad_norm": 1.533518671989441, "kl": 1.4833984375, "learning_rate": 7.22726246362592e-06, "loss": -0.0278, "num_tokens": 46657390.0, "reward": -1.21337890625, "reward_std": 0.5133859515190125, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -6.3330078125, "rewards/ppl_reward/std": 3.6627306938171387, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 162.34375, "completions/mean_terminated_length": 162.34375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 3.1547365214742613, "grad_norm": 1.6367573738098145, "kl": 1.4140625, "learning_rate": 7.219083486279068e-06, "loss": 0.1213, "num_tokens": 46675452.0, "reward": -0.4346923828125, "reward_std": 0.1845598816871643, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.767822265625, "rewards/ppl_reward/std": 2.4441585540771484, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 139.46875, "completions/mean_terminated_length": 139.46875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.1559549192811454, "grad_norm": 1.9516574144363403, "kl": 3.720703125, "learning_rate": 7.210906524674187e-06, "loss": 0.1299, "num_tokens": 46690770.0, "reward": -2.9185791015625, "reward_std": 0.907960057258606, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -9.587158203125, "rewards/ppl_reward/std": 11.022764205932617, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 157.171875, "completions/mean_terminated_length": 157.171875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.1571733170880294, "grad_norm": 1.8938087224960327, "kl": 5.201171875, "learning_rate": 7.202731584738323e-06, "loss": 0.2124, "num_tokens": 46707613.0, "reward": -1.7091064453125, "reward_std": 1.4018142223358154, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.090087890625, "rewards/ppl_reward/std": 6.5332818031311035, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 159.015625, "completions/mean_terminated_length": 159.015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.158391714894913, "grad_norm": 1.7767462730407715, "kl": 4.6220703125, "learning_rate": 7.194558672397072e-06, "loss": 0.2411, "num_tokens": 46725718.0, "reward": -0.8587646484375, "reward_std": 0.5826424360275269, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.365966796875, "rewards/ppl_reward/std": 2.1780898571014404, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 145.8125, "completions/mean_terminated_length": 145.8125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 3.159610112701797, "grad_norm": 2.5509111881256104, "kl": 2.9609375, "learning_rate": 7.186387793574554e-06, "loss": 0.0382, "num_tokens": 46741794.0, "reward": -1.20361328125, "reward_std": 0.9037159085273743, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.1884765625, "rewards/ppl_reward/std": 3.253570556640625, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.160828510508681, "grad_norm": 2.1493515968322754, "kl": 3.173828125, "learning_rate": 7.17821895419341e-06, "loss": 0.0936, "num_tokens": 46758722.0, "reward": -1.5748291015625, "reward_std": 0.9558581113815308, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.860595703125, "rewards/ppl_reward/std": 6.220080852508545, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 144.046875, "completions/mean_terminated_length": 144.046875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.162046908315565, "grad_norm": 2.022231101989746, "kl": 5.9765625, "learning_rate": 7.170052160174813e-06, "loss": 0.1985, "num_tokens": 46775525.0, "reward": -1.77197265625, "reward_std": 1.3736263513565063, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.1376953125, "rewards/ppl_reward/std": 5.4521613121032715, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 137.15625, "completions/mean_terminated_length": 137.15625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 3.163265306122449, "grad_norm": 1.838220477104187, "kl": 3.3046875, "learning_rate": 7.161887417438447e-06, "loss": 0.0941, "num_tokens": 46790767.0, "reward": -1.7747802734375, "reward_std": 0.4629971981048584, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.354248046875, "rewards/ppl_reward/std": 4.732234001159668, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 130.921875, "completions/mean_terminated_length": 130.921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.1644837039293328, "grad_norm": 2.2287487983703613, "kl": 6.921875, "learning_rate": 7.153724731902506e-06, "loss": 0.2645, "num_tokens": 46805562.0, "reward": -2.14990234375, "reward_std": 1.866194248199463, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.7373046875, "rewards/ppl_reward/std": 5.904135704040527, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.19352105259895325, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.165702101736217, "grad_norm": 4.120550632476807, "kl": 8.484375, "learning_rate": 7.145564109483705e-06, "loss": 0.2242, "num_tokens": 46820210.0, "reward": -1.9036865234375, "reward_std": 2.022855520248413, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -7.104248046875, "rewards/ppl_reward/std": 8.409647941589355, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19527530670166016, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.166920499543101, "grad_norm": 3.2908926010131836, "kl": 8.265625, "learning_rate": 7.1374055560972435e-06, "loss": 0.2907, "num_tokens": 46837330.0, "reward": -0.5845947265625, "reward_std": 0.9891967177391052, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.661376953125, "rewards/ppl_reward/std": 2.8852126598358154, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 150.78125, "completions/mean_terminated_length": 150.78125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.168138897349985, "grad_norm": 2.038423776626587, "kl": 6.68359375, "learning_rate": 7.129249077656844e-06, "loss": 0.268, "num_tokens": 46854052.0, "reward": -1.050048828125, "reward_std": 0.9270248413085938, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.74853515625, "rewards/ppl_reward/std": 2.509202718734741, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.228183776140213, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 139.84375, "completions/mean_terminated_length": 139.84375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.169357295156869, "grad_norm": 1.802713394165039, "kl": 4.16796875, "learning_rate": 7.121094680074707e-06, "loss": 0.0446, "num_tokens": 46869978.0, "reward": -1.4246826171875, "reward_std": 0.5869272947311401, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.521240234375, "rewards/ppl_reward/std": 4.095688819885254, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 147.578125, "completions/mean_terminated_length": 147.578125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.1705756929637525, "grad_norm": 2.077249050140381, "kl": 5.7421875, "learning_rate": 7.112942369261537e-06, "loss": 0.2023, "num_tokens": 46886703.0, "reward": -0.9385986328125, "reward_std": 0.7286853194236755, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.431884765625, "rewards/ppl_reward/std": 2.8789620399475098, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13495801389217377, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 135.796875, "completions/mean_terminated_length": 135.796875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.1717940907706366, "grad_norm": 1.8059602975845337, "kl": 3.921875, "learning_rate": 7.104792151126515e-06, "loss": 0.1115, "num_tokens": 46901610.0, "reward": -1.254638671875, "reward_std": 0.8327597379684448, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.04052734375, "rewards/ppl_reward/std": 2.408154010772705, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1666666716337204, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 154.28125, "completions/mean_terminated_length": 154.28125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.1730124885775206, "grad_norm": 3.5364649295806885, "kl": 7.04296875, "learning_rate": 7.096644031577316e-06, "loss": 0.1922, "num_tokens": 46918740.0, "reward": -1.110107421875, "reward_std": 0.6012626886367798, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.75927734375, "rewards/ppl_reward/std": 3.166403293609619, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404092609882355, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 145.59375, "completions/mean_terminated_length": 145.59375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.1742308863844046, "grad_norm": 1.4975149631500244, "kl": 6.1171875, "learning_rate": 7.088498016520088e-06, "loss": 0.249, "num_tokens": 46934858.0, "reward": -2.3067626953125, "reward_std": 0.9779525995254517, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.230712890625, "rewards/ppl_reward/std": 7.583371162414551, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 152.703125, "completions/mean_terminated_length": 152.703125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.1754492841912887, "grad_norm": 2.494511127471924, "kl": 7.37109375, "learning_rate": 7.080354111859451e-06, "loss": 0.2381, "num_tokens": 46951647.0, "reward": -1.7974853515625, "reward_std": 1.4334290027618408, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -6.993408203125, "rewards/ppl_reward/std": 4.895442962646484, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 142.90625, "completions/mean_terminated_length": 142.90625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.1766676819981723, "grad_norm": 2.841768741607666, "kl": 5.197265625, "learning_rate": 7.0722123234985066e-06, "loss": 0.2269, "num_tokens": 46967553.0, "reward": -0.58544921875, "reward_std": 0.5788865685462952, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.7646484375, "rewards/ppl_reward/std": 2.485548496246338, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 148.328125, "completions/mean_terminated_length": 148.328125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.1778860798050563, "grad_norm": 1.8225312232971191, "kl": 3.94140625, "learning_rate": 7.064072657338803e-06, "loss": 0.0515, "num_tokens": 46983974.0, "reward": -1.52099609375, "reward_std": 1.045445203781128, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.6435546875, "rewards/ppl_reward/std": 3.511512517929077, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 147.53125, "completions/mean_terminated_length": 147.53125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.1791044776119404, "grad_norm": 1.5875264406204224, "kl": 3.14453125, "learning_rate": 7.055935119280369e-06, "loss": 0.0804, "num_tokens": 46999936.0, "reward": -3.4735107421875, "reward_std": 0.8595156669616699, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.634521484375, "rewards/ppl_reward/std": 7.557288646697998, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 149.703125, "completions/mean_terminated_length": 149.703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.1803228754188244, "grad_norm": 1.5916584730148315, "kl": 4.859375, "learning_rate": 7.047799715221682e-06, "loss": 0.1098, "num_tokens": 47016237.0, "reward": -1.327880859375, "reward_std": 0.9895291924476624, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.12451171875, "rewards/ppl_reward/std": 2.4303319454193115, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 152.15625, "completions/mean_terminated_length": 152.15625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.181541273225708, "grad_norm": 1.719580888748169, "kl": 3.3984375, "learning_rate": 7.03966645105967e-06, "loss": 0.0595, "num_tokens": 47033271.0, "reward": -1.06414794921875, "reward_std": 0.5493143796920776, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.8236083984375, "rewards/ppl_reward/std": 5.054355144500732, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.182759671032592, "grad_norm": 2.455711603164673, "kl": 6.0234375, "learning_rate": 7.031535332689722e-06, "loss": 0.2854, "num_tokens": 47049567.0, "reward": -1.518798828125, "reward_std": 1.111680030822754, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.45166015625, "rewards/ppl_reward/std": 3.0855255126953125, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 149.59375, "completions/mean_terminated_length": 149.59375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.183978068839476, "grad_norm": 1.6993485689163208, "kl": 2.1640625, "learning_rate": 7.023406366005655e-06, "loss": 0.0264, "num_tokens": 47065621.0, "reward": -1.3192138671875, "reward_std": 1.1732085943222046, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.333740234375, "rewards/ppl_reward/std": 5.18096923828125, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 159.0625, "completions/mean_terminated_length": 159.0625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.18519646664636, "grad_norm": 1.7231662273406982, "kl": 2.7294921875, "learning_rate": 7.015279556899739e-06, "loss": 0.0373, "num_tokens": 47083273.0, "reward": -0.823486328125, "reward_std": 0.8716440200805664, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.24072265625, "rewards/ppl_reward/std": 3.877373218536377, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1836577206850052, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 134.828125, "completions/mean_terminated_length": 134.828125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.186414864453244, "grad_norm": 1.582781434059143, "kl": 4.17578125, "learning_rate": 7.007154911262678e-06, "loss": 0.0694, "num_tokens": 47098494.0, "reward": -0.7222900390625, "reward_std": 1.1185142993927002, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.022705078125, "rewards/ppl_reward/std": 2.856818437576294, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.23345555365085602, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.1876332622601278, "grad_norm": 1.6111632585525513, "kl": 2.763671875, "learning_rate": 6.999032434983606e-06, "loss": 0.012, "num_tokens": 47113606.0, "reward": -0.9205322265625, "reward_std": 0.9075577259063721, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.591064453125, "rewards/ppl_reward/std": 3.003082513809204, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.188851660067012, "grad_norm": 1.610287070274353, "kl": 2.734375, "learning_rate": 6.990912133950076e-06, "loss": 0.0157, "num_tokens": 47129410.0, "reward": -1.6607666015625, "reward_std": 0.8306893706321716, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.993408203125, "rewards/ppl_reward/std": 4.578327655792236, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.190070057873896, "grad_norm": 2.345747232437134, "kl": 2.802734375, "learning_rate": 6.9827940140480776e-06, "loss": 0.0675, "num_tokens": 47145330.0, "reward": -0.8536376953125, "reward_std": 0.7596184611320496, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.449462890625, "rewards/ppl_reward/std": 3.4412102699279785, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 150.71875, "completions/mean_terminated_length": 150.71875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.19128845568078, "grad_norm": 1.577114462852478, "kl": 4.03515625, "learning_rate": 6.974678081162019e-06, "loss": 0.0918, "num_tokens": 47162856.0, "reward": -0.21044921875, "reward_std": 0.5187861919403076, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.1318359375, "rewards/ppl_reward/std": 1.1805118322372437, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 3.1925068534876635, "grad_norm": 1.7748262882232666, "kl": 2.99609375, "learning_rate": 6.966564341174709e-06, "loss": 0.1036, "num_tokens": 47178314.0, "reward": -2.463623046875, "reward_std": 0.6816343069076538, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.68505859375, "rewards/ppl_reward/std": 5.23380708694458, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07864411175251007, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 144.84375, "completions/mean_terminated_length": 130.88890075683594, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.1937252512945475, "grad_norm": 2.134695529937744, "kl": 8.81640625, "learning_rate": 6.958452799967385e-06, "loss": 0.3846, "num_tokens": 47194152.0, "reward": -3.8233642578125, "reward_std": 1.3446733951568604, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -11.146728515625, "rewards/ppl_reward/std": 14.158782958984375, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 137.9375, "completions/mean_terminated_length": 137.9375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 3.1949436491014316, "grad_norm": 1.9426063299179077, "kl": 5.078125, "learning_rate": 6.950343463419678e-06, "loss": 0.0561, "num_tokens": 47209316.0, "reward": -0.561279296875, "reward_std": 1.2101550102233887, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -4.53662109375, "rewards/ppl_reward/std": 3.0907211303710938, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.25341787934303284, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 137.484375, "completions/mean_terminated_length": 137.484375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.1961620469083156, "grad_norm": 1.6204949617385864, "kl": 5.4140625, "learning_rate": 6.942236337409623e-06, "loss": 0.1604, "num_tokens": 47224499.0, "reward": -0.48223876953125, "reward_std": 0.5399947166442871, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.5816650390625, "rewards/ppl_reward/std": 2.38966965675354, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09827063977718353, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 143.90625, "completions/mean_terminated_length": 143.90625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.1973804447151997, "grad_norm": 1.475284457206726, "kl": 2.265625, "learning_rate": 6.93413142781366e-06, "loss": 0.0124, "num_tokens": 47241069.0, "reward": -0.8111572265625, "reward_std": 0.7974005937576294, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.403564453125, "rewards/ppl_reward/std": 2.74625563621521, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.1985988425220833, "grad_norm": 2.213909387588501, "kl": 6.09765625, "learning_rate": 6.926028740506616e-06, "loss": 0.2687, "num_tokens": 47257353.0, "reward": -2.4725341796875, "reward_std": 2.5088603496551514, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.554443359375, "rewards/ppl_reward/std": 9.04465103149414, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.25283610820770264, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.1998172403289673, "grad_norm": 1.6508910655975342, "kl": 4.427734375, "learning_rate": 6.917928281361703e-06, "loss": 0.1366, "num_tokens": 47272941.0, "reward": -1.9276123046875, "reward_std": 0.7101242542266846, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.527099609375, "rewards/ppl_reward/std": 7.067391872406006, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.2010356381358513, "grad_norm": 1.6387276649475098, "kl": 5.6328125, "learning_rate": 6.909830056250527e-06, "loss": 0.0916, "num_tokens": 47288289.0, "reward": -1.9444580078125, "reward_std": 1.1600866317749023, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.326416015625, "rewards/ppl_reward/std": 7.96708869934082, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 136.984375, "completions/mean_terminated_length": 136.984375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.2022540359427354, "grad_norm": 1.700164794921875, "kl": 2.3359375, "learning_rate": 6.901734071043072e-06, "loss": 0.0538, "num_tokens": 47303768.0, "reward": -0.81024169921875, "reward_std": 0.2834228277206421, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.4642333984375, "rewards/ppl_reward/std": 2.570368528366089, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 147.34375, "completions/mean_terminated_length": 147.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.2034724337496194, "grad_norm": 2.463207721710205, "kl": 8.3818359375, "learning_rate": 6.893640331607694e-06, "loss": 0.4008, "num_tokens": 47320118.0, "reward": -2.66015625, "reward_std": 0.9347673058509827, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.8515625, "rewards/ppl_reward/std": 2.605130910873413, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.204690831556503, "grad_norm": 1.7490381002426147, "kl": 3.06640625, "learning_rate": 6.885548843811125e-06, "loss": 0.0548, "num_tokens": 47336060.0, "reward": -0.37298583984375, "reward_std": 0.5150330066680908, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.4178466796875, "rewards/ppl_reward/std": 3.458749532699585, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 157.203125, "completions/mean_terminated_length": 157.203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.205909229363387, "grad_norm": 1.5421422719955444, "kl": 5.75, "learning_rate": 6.8774596135184694e-06, "loss": 0.2608, "num_tokens": 47353225.0, "reward": -4.607421875, "reward_std": 3.792886972427368, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -12.85546875, "rewards/ppl_reward/std": 24.224889755249023, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 146.03125, "completions/mean_terminated_length": 146.03125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.207127627170271, "grad_norm": 1.6151187419891357, "kl": 3.90625, "learning_rate": 6.869372646593182e-06, "loss": 0.0941, "num_tokens": 47369947.0, "reward": -0.129638671875, "reward_std": 0.7243461608886719, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -3.75146484375, "rewards/ppl_reward/std": 1.0974630117416382, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22736713290214539, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 150.828125, "completions/mean_terminated_length": 150.828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 3.208346024977155, "grad_norm": 1.8595614433288574, "kl": 3.8671875, "learning_rate": 6.861287948897091e-06, "loss": 0.1842, "num_tokens": 47386120.0, "reward": -0.7227783203125, "reward_std": 0.38591694831848145, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.289306640625, "rewards/ppl_reward/std": 1.9859123229980469, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 163.109375, "completions/mean_terminated_length": 163.109375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.209564422784039, "grad_norm": 1.2955807447433472, "kl": 2.8046875, "learning_rate": 6.853205526290376e-06, "loss": 0.0412, "num_tokens": 47403143.0, "reward": -0.781982421875, "reward_std": 0.9305940270423889, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.31396484375, "rewards/ppl_reward/std": 3.372558355331421, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 158.65625, "completions/mean_terminated_length": 158.65625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 3.210782820590923, "grad_norm": 2.1245572566986084, "kl": 4.619140625, "learning_rate": 6.845125384631559e-06, "loss": 0.2437, "num_tokens": 47420985.0, "reward": -1.08251953125, "reward_std": 1.5263954401016235, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.9306640625, "rewards/ppl_reward/std": 7.34926700592041, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 149.453125, "completions/mean_terminated_length": 149.453125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.212001218397807, "grad_norm": 2.4312028884887695, "kl": 5.54296875, "learning_rate": 6.837047529777516e-06, "loss": 0.1533, "num_tokens": 47437830.0, "reward": -0.72625732421875, "reward_std": 1.2750303745269775, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -4.7728271484375, "rewards/ppl_reward/std": 2.5071046352386475, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2781464755535126, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 144.734375, "completions/mean_terminated_length": 144.734375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.213219616204691, "grad_norm": 1.300804853439331, "kl": 2.84765625, "learning_rate": 6.828971967583468e-06, "loss": 0.0583, "num_tokens": 47454045.0, "reward": 0.03668212890625, "reward_std": 0.4262198805809021, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.6531982421875, "rewards/ppl_reward/std": 1.3339142799377441, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 150.28125, "completions/mean_terminated_length": 150.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.214438014011575, "grad_norm": 1.844274640083313, "kl": 4.8125, "learning_rate": 6.820898703902969e-06, "loss": 0.1426, "num_tokens": 47471359.0, "reward": -1.09857177734375, "reward_std": 0.6598945260047913, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.7362060546875, "rewards/ppl_reward/std": 4.0997233390808105, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 164.46875, "completions/mean_terminated_length": 164.46875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 3.2156564118184585, "grad_norm": 2.1876585483551025, "kl": 2.81640625, "learning_rate": 6.812827744587906e-06, "loss": 0.1571, "num_tokens": 47489693.0, "reward": -3.2392578125, "reward_std": 0.45968109369277954, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -10.330078125, "rewards/ppl_reward/std": 4.712650299072266, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 143.703125, "completions/mean_terminated_length": 143.703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.2168748096253426, "grad_norm": 2.505282163619995, "kl": 4.255859375, "learning_rate": 6.804759095488504e-06, "loss": 0.1496, "num_tokens": 47506762.0, "reward": -1.87109375, "reward_std": 0.679681658744812, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.4453125, "rewards/ppl_reward/std": 4.487252235412598, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 135.546875, "completions/mean_terminated_length": 135.546875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.2180932074322266, "grad_norm": 1.8658778667449951, "kl": 5.3828125, "learning_rate": 6.796692762453303e-06, "loss": 0.1864, "num_tokens": 47521885.0, "reward": -1.9490966796875, "reward_std": 1.344637155532837, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.609130859375, "rewards/ppl_reward/std": 7.720335006713867, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 147.046875, "completions/mean_terminated_length": 147.046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.2193116052391106, "grad_norm": 1.7885019779205322, "kl": 4.966796875, "learning_rate": 6.78862875132917e-06, "loss": 0.1874, "num_tokens": 47538296.0, "reward": -1.068359375, "reward_std": 0.4742177426815033, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.79296875, "rewards/ppl_reward/std": 3.316617488861084, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 133.796875, "completions/mean_terminated_length": 133.796875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.2205300030459947, "grad_norm": 2.383615732192993, "kl": 4.216796875, "learning_rate": 6.780567067961293e-06, "loss": 0.0849, "num_tokens": 47553659.0, "reward": -0.3961181640625, "reward_std": 0.8890777826309204, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.378173828125, "rewards/ppl_reward/std": 2.0873496532440186, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.24010038375854492, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.2217484008528783, "grad_norm": 2.298149824142456, "kl": 6.76953125, "learning_rate": 6.772507718193161e-06, "loss": 0.309, "num_tokens": 47571751.0, "reward": -5.266357421875, "reward_std": 1.2975106239318848, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -14.06396484375, "rewards/ppl_reward/std": 17.396244049072266, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 145.859375, "completions/mean_terminated_length": 145.859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.2229667986597623, "grad_norm": 1.5506629943847656, "kl": 4.87109375, "learning_rate": 6.764450707866577e-06, "loss": 0.1498, "num_tokens": 47588326.0, "reward": -1.50262451171875, "reward_std": 0.721856951713562, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.7396240234375, "rewards/ppl_reward/std": 2.796741008758545, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.2241851964666464, "grad_norm": 2.5048186779022217, "kl": 2.9140625, "learning_rate": 6.756396042821653e-06, "loss": -0.0095, "num_tokens": 47604022.0, "reward": -1.6370849609375, "reward_std": 0.3524841368198395, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.164794921875, "rewards/ppl_reward/std": 2.788269281387329, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 157.078125, "completions/mean_terminated_length": 157.078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.2254035942735304, "grad_norm": 1.5648362636566162, "kl": 4.142578125, "learning_rate": 6.7483437288968e-06, "loss": 0.0617, "num_tokens": 47621979.0, "reward": -2.2293701171875, "reward_std": 1.7231711149215698, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.982177734375, "rewards/ppl_reward/std": 8.776135444641113, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2359323352575302, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 156.953125, "completions/mean_terminated_length": 156.953125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.2266219920804144, "grad_norm": 1.4796792268753052, "kl": 1.1181640625, "learning_rate": 6.740293771928717e-06, "loss": 0.0614, "num_tokens": 47639536.0, "reward": -0.238037109375, "reward_std": 0.09915121644735336, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -4.47607421875, "rewards/ppl_reward/std": 1.6353763341903687, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 149.265625, "completions/mean_terminated_length": 149.265625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.227840389887298, "grad_norm": 1.907124638557434, "kl": 2.5634765625, "learning_rate": 6.7322461777524005e-06, "loss": 0.0895, "num_tokens": 47655529.0, "reward": -1.94287109375, "reward_std": 0.8169476985931396, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.7138671875, "rewards/ppl_reward/std": 4.663269996643066, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.229058787694182, "grad_norm": 1.6734914779663086, "kl": 5.140625, "learning_rate": 6.724200952201139e-06, "loss": 0.2031, "num_tokens": 47672353.0, "reward": -0.27740478515625, "reward_std": 0.5579161643981934, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.2579345703125, "rewards/ppl_reward/std": 1.8818774223327637, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 137.890625, "completions/mean_terminated_length": 137.890625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.230277185501066, "grad_norm": 1.681533932685852, "kl": 3.2890625, "learning_rate": 6.7161581011064955e-06, "loss": 0.0673, "num_tokens": 47687418.0, "reward": -1.3638916015625, "reward_std": 0.6241819262504578, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.563720703125, "rewards/ppl_reward/std": 3.4108448028564453, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.23149558330795, "grad_norm": 1.9793578386306763, "kl": 5.5234375, "learning_rate": 6.70811763029831e-06, "loss": 0.1008, "num_tokens": 47703410.0, "reward": -1.79345703125, "reward_std": 1.4540873765945435, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.1181640625, "rewards/ppl_reward/std": 6.583786487579346, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 152.328125, "completions/mean_terminated_length": 152.328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.232713981114834, "grad_norm": 1.787943959236145, "kl": 3.1328125, "learning_rate": 6.700079545604707e-06, "loss": 0.0324, "num_tokens": 47719647.0, "reward": -1.067138671875, "reward_std": 0.9004063010215759, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.88427734375, "rewards/ppl_reward/std": 5.062929630279541, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 139.96875, "completions/mean_terminated_length": 139.96875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.233932378921718, "grad_norm": 1.5235079526901245, "kl": 3.140625, "learning_rate": 6.6920438528520794e-06, "loss": 0.0148, "num_tokens": 47735373.0, "reward": -1.179931640625, "reward_std": 0.8286696672439575, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.03955078125, "rewards/ppl_reward/std": 3.4819324016571045, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 146.5625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.235150776728602, "grad_norm": 1.2325152158737183, "kl": 2.75390625, "learning_rate": 6.684010557865077e-06, "loss": 0.0323, "num_tokens": 47751353.0, "reward": -0.6871337890625, "reward_std": 0.8248595595359802, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.085205078125, "rewards/ppl_reward/std": 3.702601194381714, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 135.265625, "completions/mean_terminated_length": 135.265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.236369174535486, "grad_norm": 2.0846173763275146, "kl": 3.15625, "learning_rate": 6.6759796664666235e-06, "loss": 0.0164, "num_tokens": 47766474.0, "reward": -3.187255859375, "reward_std": 1.2639350891113281, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.08544921875, "rewards/ppl_reward/std": 4.650604724884033, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 156.328125, "completions/mean_terminated_length": 156.328125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.23758757234237, "grad_norm": 1.5421503782272339, "kl": 3.853515625, "learning_rate": 6.667951184477892e-06, "loss": 0.0224, "num_tokens": 47783319.0, "reward": -0.669921875, "reward_std": 0.6774266362190247, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.93359375, "rewards/ppl_reward/std": 1.100785255432129, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 153.859375, "completions/mean_terminated_length": 153.859375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.2388059701492535, "grad_norm": 2.8291261196136475, "kl": 3.5078125, "learning_rate": 6.6599251177183135e-06, "loss": 0.098, "num_tokens": 47800134.0, "reward": -1.5303955078125, "reward_std": 0.5531166195869446, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.888916015625, "rewards/ppl_reward/std": 2.503694534301758, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.2400243679561376, "grad_norm": 1.8389921188354492, "kl": 5.94140625, "learning_rate": 6.6519014720055684e-06, "loss": 0.2583, "num_tokens": 47816591.0, "reward": -1.0704345703125, "reward_std": 1.4280821084976196, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.695556640625, "rewards/ppl_reward/std": 3.959817409515381, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.2412427657630216, "grad_norm": 1.3540871143341064, "kl": 5.2890625, "learning_rate": 6.643880253155582e-06, "loss": 0.1837, "num_tokens": 47834839.0, "reward": -1.119873046875, "reward_std": 0.6552229523658752, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.81787109375, "rewards/ppl_reward/std": 2.3531363010406494, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14412261545658112, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 148.640625, "completions/mean_terminated_length": 148.640625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.2424611635699057, "grad_norm": 3.8921453952789307, "kl": 1.91015625, "learning_rate": 6.635861466982512e-06, "loss": 0.0636, "num_tokens": 47850848.0, "reward": -2.7603759765625, "reward_std": 3.477477550506592, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.302001953125, "rewards/ppl_reward/std": 20.87950897216797, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 148.34375, "completions/mean_terminated_length": 148.34375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.2436795613767897, "grad_norm": 2.3126251697540283, "kl": 1.98046875, "learning_rate": 6.627845119298764e-06, "loss": 0.0007, "num_tokens": 47866758.0, "reward": -2.85791015625, "reward_std": 1.5551859140396118, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -9.4814453125, "rewards/ppl_reward/std": 7.244430065155029, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.2448979591836733, "grad_norm": 1.647212266921997, "kl": 3.078125, "learning_rate": 6.619831215914974e-06, "loss": 0.0533, "num_tokens": 47884638.0, "reward": -2.9610595703125, "reward_std": 0.8579857349395752, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.640869140625, "rewards/ppl_reward/std": 10.422300338745117, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 156.359375, "completions/mean_terminated_length": 156.359375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.2461163569905573, "grad_norm": 5.165167808532715, "kl": 7.8046875, "learning_rate": 6.611819762640004e-06, "loss": 0.3167, "num_tokens": 47902237.0, "reward": -1.33642578125, "reward_std": 1.350693702697754, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.0400390625, "rewards/ppl_reward/std": 5.3797173500061035, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.27433067560195923, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 136.921875, "completions/mean_terminated_length": 136.921875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.2473347547974414, "grad_norm": 1.632773756980896, "kl": 3.0068359375, "learning_rate": 6.603810765280937e-06, "loss": -0.0089, "num_tokens": 47917712.0, "reward": -1.5955810546875, "reward_std": 1.2338228225708008, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.886474609375, "rewards/ppl_reward/std": 5.524080276489258, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 147.796875, "completions/mean_terminated_length": 147.796875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.2485531526043254, "grad_norm": 1.8777273893356323, "kl": 5.546875, "learning_rate": 6.595804229643086e-06, "loss": 0.2167, "num_tokens": 47934027.0, "reward": -0.630615234375, "reward_std": 0.67106693983078, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.89404296875, "rewards/ppl_reward/std": 2.1786766052246094, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 148.953125, "completions/mean_terminated_length": 148.953125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.2497715504112095, "grad_norm": 2.771245241165161, "kl": 5.37890625, "learning_rate": 6.587800161529965e-06, "loss": 0.1594, "num_tokens": 47950792.0, "reward": -1.29052734375, "reward_std": 1.2339578866958618, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.9560546875, "rewards/ppl_reward/std": 3.942974328994751, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.25539806485176086, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 146.34375, "completions/mean_terminated_length": 146.34375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.250989948218093, "grad_norm": 2.4176597595214844, "kl": 5.35546875, "learning_rate": 6.579798566743314e-06, "loss": 0.1146, "num_tokens": 47966766.0, "reward": -3.1646728515625, "reward_std": 1.3593939542770386, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.790283203125, "rewards/ppl_reward/std": 10.47038745880127, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 156.859375, "completions/mean_terminated_length": 156.859375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.252208346024977, "grad_norm": 2.4040656089782715, "kl": 2.6875, "learning_rate": 6.5717994510830695e-06, "loss": 0.0009, "num_tokens": 47984301.0, "reward": -0.795166015625, "reward_std": 0.5762432217597961, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.31689453125, "rewards/ppl_reward/std": 2.0834178924560547, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 156.921875, "completions/mean_terminated_length": 156.921875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.253426743831861, "grad_norm": 1.426790475845337, "kl": 3.158203125, "learning_rate": 6.563802820347378e-06, "loss": 0.1323, "num_tokens": 48001432.0, "reward": -0.731201171875, "reward_std": 0.44350916147232056, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.25146484375, "rewards/ppl_reward/std": 2.5614302158355713, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 154.953125, "completions/mean_terminated_length": 154.953125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.254645141638745, "grad_norm": 2.389474630355835, "kl": 4.12890625, "learning_rate": 6.5558086803325774e-06, "loss": 0.0752, "num_tokens": 48018517.0, "reward": -1.74774169921875, "reward_std": 1.3951871395111084, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.0579833984375, "rewards/ppl_reward/std": 7.6084303855896, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.2558635394456292, "grad_norm": 3.9790682792663574, "kl": 5.31640625, "learning_rate": 6.547817036833208e-06, "loss": 0.0865, "num_tokens": 48034797.0, "reward": -1.775634765625, "reward_std": 2.409546375274658, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.02001953125, "rewards/ppl_reward/std": 7.917418479919434, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20351573824882507, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 3.257081937252513, "grad_norm": 2.1741507053375244, "kl": 5.3828125, "learning_rate": 6.539827895641997e-06, "loss": 0.1224, "num_tokens": 48051605.0, "reward": -0.55889892578125, "reward_std": 1.4545053243637085, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -4.4693603515625, "rewards/ppl_reward/std": 4.912224292755127, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.224347323179245, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 161.40625, "completions/mean_terminated_length": 161.40625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.258300335059397, "grad_norm": 3.2597010135650635, "kl": 7.0, "learning_rate": 6.531841262549855e-06, "loss": 0.2182, "num_tokens": 48070287.0, "reward": -1.345458984375, "reward_std": 0.6315054893493652, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.22216796875, "rewards/ppl_reward/std": 2.985450029373169, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 143.9375, "completions/mean_terminated_length": 143.9375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.259518732866281, "grad_norm": 2.4501161575317383, "kl": 4.875, "learning_rate": 6.523857143345884e-06, "loss": 0.1061, "num_tokens": 48086819.0, "reward": -1.425537109375, "reward_std": 1.0125885009765625, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -6.06201171875, "rewards/ppl_reward/std": 2.1538493633270264, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.15465456247329712, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 133.578125, "completions/mean_terminated_length": 133.578125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.260737130673165, "grad_norm": 2.1548655033111572, "kl": 3.79296875, "learning_rate": 6.515875543817349e-06, "loss": 0.0594, "num_tokens": 48102408.0, "reward": -2.31591796875, "reward_std": 2.056806802749634, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -7.7958984375, "rewards/ppl_reward/std": 10.225828170776367, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.27792346477508545, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 146.546875, "completions/mean_terminated_length": 146.546875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.2619555284800485, "grad_norm": 2.3266842365264893, "kl": 2.88671875, "learning_rate": 6.5078964697497036e-06, "loss": 0.0869, "num_tokens": 48118347.0, "reward": -2.521240234375, "reward_std": 1.2654638290405273, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -8.37841796875, "rewards/ppl_reward/std": 7.3652448654174805, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.17847840487957, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 143.59375, "completions/mean_terminated_length": 143.59375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.2631739262869326, "grad_norm": 1.477843165397644, "kl": 3.7265625, "learning_rate": 6.499919926926566e-06, "loss": 0.1056, "num_tokens": 48134641.0, "reward": -1.46142578125, "reward_std": 0.7178446054458618, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.3603515625, "rewards/ppl_reward/std": 2.67047119140625, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15430335700511932, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 148.515625, "completions/mean_terminated_length": 148.515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.2643923240938166, "grad_norm": 2.033106565475464, "kl": 3.4296875, "learning_rate": 6.491945921129712e-06, "loss": 0.0556, "num_tokens": 48151434.0, "reward": -1.07080078125, "reward_std": 1.1923582553863525, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.7197265625, "rewards/ppl_reward/std": 4.86738920211792, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 135.421875, "completions/mean_terminated_length": 135.421875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.2656107219007007, "grad_norm": 1.8854045867919922, "kl": 3.52734375, "learning_rate": 6.483974458139083e-06, "loss": 0.0928, "num_tokens": 48166861.0, "reward": -1.2940673828125, "reward_std": 1.1787586212158203, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.181884765625, "rewards/ppl_reward/std": 4.25479793548584, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 155.234375, "completions/mean_terminated_length": 155.234375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.2668291197075847, "grad_norm": 1.7313425540924072, "kl": 3.259765625, "learning_rate": 6.476005543732783e-06, "loss": 0.1083, "num_tokens": 48183900.0, "reward": -2.5054931640625, "reward_std": 1.07124924659729, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.729736328125, "rewards/ppl_reward/std": 7.054360389709473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 149.171875, "completions/mean_terminated_length": 149.171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.2680475175144683, "grad_norm": 2.151566743850708, "kl": 3.607421875, "learning_rate": 6.468039183687065e-06, "loss": 0.1963, "num_tokens": 48200191.0, "reward": -9.7821044921875, "reward_std": 1.5777318477630615, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -23.236083984375, "rewards/ppl_reward/std": 44.27954864501953, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 131.765625, "completions/mean_terminated_length": 131.765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.2692659153213524, "grad_norm": 3.6069247722625732, "kl": 4.17578125, "learning_rate": 6.4600753837763255e-06, "loss": 0.0417, "num_tokens": 48214944.0, "reward": -6.9482421875, "reward_std": 2.6964221000671387, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "rewards/ppl_reward/mean": -17.185546875, "rewards/ppl_reward/std": 20.41903305053711, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2348787635564804, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.2704843131282364, "grad_norm": 1.6214020252227783, "kl": 3.7578125, "learning_rate": 6.45211414977311e-06, "loss": 0.2004, "num_tokens": 48230928.0, "reward": -11.2567138671875, "reward_std": 1.820434331893921, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -26.302490234375, "rewards/ppl_reward/std": 55.734588623046875, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.2717027109351204, "grad_norm": 2.344581127166748, "kl": 4.5, "learning_rate": 6.444155487448109e-06, "loss": 0.1504, "num_tokens": 48245564.0, "reward": -1.212158203125, "reward_std": 0.9005843997001648, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.81494140625, "rewards/ppl_reward/std": 2.407987594604492, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.272921108742004, "grad_norm": 1.6044188737869263, "kl": 3.46484375, "learning_rate": 6.436199402570138e-06, "loss": 0.0893, "num_tokens": 48261392.0, "reward": -0.2244873046875, "reward_std": 0.5775406956672668, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.144287109375, "rewards/ppl_reward/std": 1.1348867416381836, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 3.274139506548888, "grad_norm": 2.0086658000946045, "kl": 2.458984375, "learning_rate": 6.428245900906156e-06, "loss": 0.0229, "num_tokens": 48276680.0, "reward": -1.024658203125, "reward_std": 0.6190070509910583, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.83837890625, "rewards/ppl_reward/std": 2.6301417350769043, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 148.796875, "completions/mean_terminated_length": 148.796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.275357904355772, "grad_norm": 2.3535263538360596, "kl": 5.58984375, "learning_rate": 6.420294988221236e-06, "loss": 0.0828, "num_tokens": 48294891.0, "reward": -0.3294677734375, "reward_std": 0.9113805294036865, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.112060546875, "rewards/ppl_reward/std": 1.6998944282531738, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 130.046875, "completions/mean_terminated_length": 130.046875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.276576302162656, "grad_norm": 2.8561182022094727, "kl": 4.53515625, "learning_rate": 6.412346670278584e-06, "loss": 0.1463, "num_tokens": 48309750.0, "reward": -0.75238037109375, "reward_std": 0.4388802647590637, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.2703857421875, "rewards/ppl_reward/std": 2.411677837371826, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.27779469996954, "grad_norm": 1.8758432865142822, "kl": 4.955078125, "learning_rate": 6.404400952839522e-06, "loss": 0.1403, "num_tokens": 48325422.0, "reward": -0.86187744140625, "reward_std": 0.8265683054924011, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.2003173828125, "rewards/ppl_reward/std": 2.1564390659332275, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 121.421875, "completions/mean_terminated_length": 121.421875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.2790130977764242, "grad_norm": 2.3845512866973877, "kl": 5.05859375, "learning_rate": 6.396457841663492e-06, "loss": 0.1417, "num_tokens": 48339433.0, "reward": -2.501708984375, "reward_std": 1.2450450658798218, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.51904296875, "rewards/ppl_reward/std": 5.390897750854492, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 149.671875, "completions/mean_terminated_length": 149.671875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 3.280231495583308, "grad_norm": 1.697907567024231, "kl": 6.70703125, "learning_rate": 6.3885173425080405e-06, "loss": 0.3895, "num_tokens": 48356644.0, "reward": -1.1787109375, "reward_std": 0.5780504941940308, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.060546875, "rewards/ppl_reward/std": 2.815166711807251, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 149.953125, "completions/mean_terminated_length": 149.953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.281449893390192, "grad_norm": 1.7260677814483643, "kl": 4.7421875, "learning_rate": 6.38057946112882e-06, "loss": 0.1603, "num_tokens": 48374249.0, "reward": -1.6534423828125, "reward_std": 0.5074753761291504, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.010009765625, "rewards/ppl_reward/std": 4.448154926300049, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.282668291197076, "grad_norm": 2.278330087661743, "kl": 2.4365234375, "learning_rate": 6.372644203279595e-06, "loss": 0.026, "num_tokens": 48389005.0, "reward": -0.401611328125, "reward_std": 0.29622551798820496, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.66259765625, "rewards/ppl_reward/std": 1.6851806640625, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.28388668900396, "grad_norm": 2.108083486557007, "kl": 3.853515625, "learning_rate": 6.364711574712219e-06, "loss": 0.0754, "num_tokens": 48406077.0, "reward": -0.792236328125, "reward_std": 0.5733431577682495, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.24072265625, "rewards/ppl_reward/std": 1.5121723413467407, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 139.65625, "completions/mean_terminated_length": 139.65625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.2851050868108436, "grad_norm": 1.7473676204681396, "kl": 2.36328125, "learning_rate": 6.356781581176638e-06, "loss": 0.0428, "num_tokens": 48422087.0, "reward": -2.384033203125, "reward_std": 0.6038079261779785, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.59619140625, "rewards/ppl_reward/std": 5.037904262542725, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 128.34375, "completions/mean_terminated_length": 128.34375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.2863234846177276, "grad_norm": 4.017602443695068, "kl": 7.9140625, "learning_rate": 6.348854228420897e-06, "loss": 0.2379, "num_tokens": 48437093.0, "reward": -1.5482177734375, "reward_std": 1.323335886001587, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -6.315185546875, "rewards/ppl_reward/std": 2.962148904800415, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.26726123690605164, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 128.40625, "completions/mean_terminated_length": 128.40625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.2875418824246117, "grad_norm": 1.6010010242462158, "kl": 4.78125, "learning_rate": 6.3409295221911235e-06, "loss": 0.14, "num_tokens": 48451951.0, "reward": -1.22064208984375, "reward_std": 0.7768957614898682, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -5.9412841796875, "rewards/ppl_reward/std": 3.834686517715454, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.125, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 149.4375, "completions/mean_terminated_length": 149.4375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.2887602802314957, "grad_norm": 2.5102484226226807, "kl": 6.97265625, "learning_rate": 6.333007468231521e-06, "loss": 0.4008, "num_tokens": 48468459.0, "reward": -1.7030029296875, "reward_std": 1.6804633140563965, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.968505859375, "rewards/ppl_reward/std": 5.592848300933838, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22658175230026245, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 137.390625, "completions/mean_terminated_length": 137.390625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.2899786780383797, "grad_norm": 2.8137409687042236, "kl": 4.501953125, "learning_rate": 6.3250880722843775e-06, "loss": 0.1589, "num_tokens": 48484164.0, "reward": -2.00537109375, "reward_std": 1.2000415325164795, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.7294921875, "rewards/ppl_reward/std": 4.111597537994385, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 119.15625, "completions/mean_terminated_length": 119.15625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.2911970758452633, "grad_norm": 1.6615098714828491, "kl": 3.0390625, "learning_rate": 6.317171340090053e-06, "loss": 0.0506, "num_tokens": 48498470.0, "reward": -2.447021484375, "reward_std": 1.5808985233306885, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.54248046875, "rewards/ppl_reward/std": 4.1645026206970215, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 144.765625, "completions/mean_terminated_length": 144.765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.2924154736521474, "grad_norm": 1.640194296836853, "kl": 3.33984375, "learning_rate": 6.309257277386975e-06, "loss": 0.13, "num_tokens": 48515119.0, "reward": -0.643798828125, "reward_std": 0.4571470022201538, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.98291015625, "rewards/ppl_reward/std": 1.4454246759414673, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 145.640625, "completions/mean_terminated_length": 145.640625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.2936338714590314, "grad_norm": 1.611694097518921, "kl": 4.76171875, "learning_rate": 6.301345889911636e-06, "loss": 0.2194, "num_tokens": 48531800.0, "reward": -1.950439453125, "reward_std": 0.9561586976051331, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.45556640625, "rewards/ppl_reward/std": 5.111971378326416, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.2948522692659155, "grad_norm": 1.7622346878051758, "kl": 4.146484375, "learning_rate": 6.293437183398592e-06, "loss": 0.148, "num_tokens": 48547520.0, "reward": -1.282470703125, "reward_std": 0.7358605861663818, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.29931640625, "rewards/ppl_reward/std": 4.860694408416748, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 141.09375, "completions/mean_terminated_length": 141.09375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.296070667072799, "grad_norm": 1.8493056297302246, "kl": 5.8203125, "learning_rate": 6.2855311635804495e-06, "loss": 0.2502, "num_tokens": 48563718.0, "reward": -0.4476318359375, "reward_std": 0.519149661064148, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.528076171875, "rewards/ppl_reward/std": 1.3550626039505005, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 128.78125, "completions/mean_terminated_length": 128.78125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.297289064879683, "grad_norm": 2.296809196472168, "kl": 5.30078125, "learning_rate": 6.277627836187874e-06, "loss": 0.2545, "num_tokens": 48578664.0, "reward": -0.66748046875, "reward_std": 0.739973783493042, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.9521484375, "rewards/ppl_reward/std": 2.582009792327881, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.298507462686567, "grad_norm": 2.6897358894348145, "kl": 6.2734375, "learning_rate": 6.269727206949577e-06, "loss": 0.2349, "num_tokens": 48593496.0, "reward": -2.3123779296875, "reward_std": 1.4900822639465332, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.202880859375, "rewards/ppl_reward/std": 6.478533744812012, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14412261545658112, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 124.53125, "completions/mean_terminated_length": 124.53125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.299725860493451, "grad_norm": 3.4484074115753174, "kl": 5.453125, "learning_rate": 6.261829281592313e-06, "loss": 0.1233, "num_tokens": 48608234.0, "reward": -1.087890625, "reward_std": 1.0916199684143066, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.73828125, "rewards/ppl_reward/std": 3.0717990398406982, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 132.890625, "completions/mean_terminated_length": 132.890625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.300944258300335, "grad_norm": 2.9961130619049072, "kl": 4.646484375, "learning_rate": 6.25393406584088e-06, "loss": 0.1786, "num_tokens": 48623707.0, "reward": -1.2926025390625, "reward_std": 0.6215507984161377, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.178955078125, "rewards/ppl_reward/std": 2.874525547027588, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 131.703125, "completions/mean_terminated_length": 131.703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 3.302162656107219, "grad_norm": 2.8403570652008057, "kl": 5.734375, "learning_rate": 6.246041565418111e-06, "loss": 0.2277, "num_tokens": 48639256.0, "reward": -1.19378662109375, "reward_std": 1.1726057529449463, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.9578857421875, "rewards/ppl_reward/std": 4.677974224090576, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23435020446777344, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.303381053914103, "grad_norm": 2.128351926803589, "kl": 6.0625, "learning_rate": 6.238151786044866e-06, "loss": 0.2625, "num_tokens": 48654976.0, "reward": -1.244384765625, "reward_std": 0.7834470272064209, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.05908203125, "rewards/ppl_reward/std": 3.698728322982788, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 120.59375, "completions/mean_terminated_length": 120.59375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.304599451720987, "grad_norm": 3.412139892578125, "kl": 2.400390625, "learning_rate": 6.230264733440037e-06, "loss": 0.051, "num_tokens": 48669294.0, "reward": -2.27081298828125, "reward_std": 0.5666606426239014, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.3619384765625, "rewards/ppl_reward/std": 7.312097549438477, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 138.484375, "completions/mean_terminated_length": 138.484375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 3.305817849527871, "grad_norm": 1.737237572669983, "kl": 4.34375, "learning_rate": 6.222380413320546e-06, "loss": 0.0996, "num_tokens": 48685381.0, "reward": -1.7364501953125, "reward_std": 1.4978547096252441, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.043212890625, "rewards/ppl_reward/std": 4.784954071044922, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2257249802350998, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 136.84375, "completions/mean_terminated_length": 136.84375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.307036247334755, "grad_norm": 2.7256557941436768, "kl": 3.828125, "learning_rate": 6.214498831401317e-06, "loss": 0.1329, "num_tokens": 48701835.0, "reward": -1.5155029296875, "reward_std": 0.733777642250061, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.741943359375, "rewards/ppl_reward/std": 4.313453674316406, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 149.21875, "completions/mean_terminated_length": 149.21875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.3082546451416386, "grad_norm": 3.8879554271698, "kl": 5.85546875, "learning_rate": 6.2066199933953035e-06, "loss": 0.2113, "num_tokens": 48719369.0, "reward": -1.22314453125, "reward_std": 0.6228975653648376, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.1103515625, "rewards/ppl_reward/std": 4.1094889640808105, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.3094730429485226, "grad_norm": 1.9984564781188965, "kl": 4.583984375, "learning_rate": 6.198743905013464e-06, "loss": 0.2376, "num_tokens": 48735585.0, "reward": -1.52197265625, "reward_std": 0.4958896040916443, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.8173828125, "rewards/ppl_reward/std": 3.7070152759552, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 131.5625, "completions/mean_terminated_length": 131.5625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.3106914407554067, "grad_norm": 1.749413013458252, "kl": 3.6767578125, "learning_rate": 6.1908705719647735e-06, "loss": 0.1138, "num_tokens": 48751421.0, "reward": -2.167236328125, "reward_std": 0.5169351100921631, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.20166015625, "rewards/ppl_reward/std": 6.13909912109375, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 118.203125, "completions/mean_terminated_length": 118.203125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.3119098385622907, "grad_norm": 2.6881303787231445, "kl": 6.2421875, "learning_rate": 6.182999999956194e-06, "loss": 0.3072, "num_tokens": 48765178.0, "reward": -2.0604248046875, "reward_std": 0.9787338972091675, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.652099609375, "rewards/ppl_reward/std": 5.789997100830078, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 141.953125, "completions/mean_terminated_length": 141.953125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.3131282363691748, "grad_norm": 3.635521411895752, "kl": 6.63671875, "learning_rate": 6.175132194692699e-06, "loss": 0.2723, "num_tokens": 48781807.0, "reward": -0.8538818359375, "reward_std": 0.7559654116630554, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.356201171875, "rewards/ppl_reward/std": 2.452361583709717, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 137.921875, "completions/mean_terminated_length": 137.921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.3143466341760583, "grad_norm": 2.2549641132354736, "kl": 6.17578125, "learning_rate": 6.167267161877248e-06, "loss": 0.2888, "num_tokens": 48797426.0, "reward": -1.997802734375, "reward_std": 0.7901788949966431, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.72216796875, "rewards/ppl_reward/std": 3.325239658355713, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 132.671875, "completions/mean_terminated_length": 132.671875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.3155650319829424, "grad_norm": 1.8844749927520752, "kl": 3.86328125, "learning_rate": 6.159404907210798e-06, "loss": 0.1542, "num_tokens": 48812437.0, "reward": -1.29833984375, "reward_std": 0.7595661282539368, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.3544921875, "rewards/ppl_reward/std": 4.788666248321533, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 132.421875, "completions/mean_terminated_length": 132.421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.3167834297898264, "grad_norm": 2.353618860244751, "kl": 6.32421875, "learning_rate": 6.151545436392292e-06, "loss": 0.2334, "num_tokens": 48827840.0, "reward": -1.415771484375, "reward_std": 1.018110752105713, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.47998046875, "rewards/ppl_reward/std": 5.058008193969727, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 156.71875, "completions/mean_terminated_length": 142.952392578125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.3180018275967105, "grad_norm": 2.737572193145752, "kl": 6.28515625, "learning_rate": 6.1436887551186466e-06, "loss": 0.4194, "num_tokens": 48845334.0, "reward": -0.702880859375, "reward_std": 0.41195932030677795, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.20263671875, "rewards/ppl_reward/std": 2.984013319015503, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 131.671875, "completions/mean_terminated_length": 131.671875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.319220225403594, "grad_norm": 1.8858273029327393, "kl": 3.8046875, "learning_rate": 6.135834869084762e-06, "loss": 0.1071, "num_tokens": 48860249.0, "reward": -0.417236328125, "reward_std": 0.7928628325462341, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.43603515625, "rewards/ppl_reward/std": 1.8259556293487549, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 136.171875, "completions/mean_terminated_length": 136.171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.320438623210478, "grad_norm": 2.495424509048462, "kl": 6.04296875, "learning_rate": 6.127983783983514e-06, "loss": 0.247, "num_tokens": 48875996.0, "reward": -1.5709228515625, "reward_std": 0.6622997522354126, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.719970703125, "rewards/ppl_reward/std": 3.978644847869873, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 134.484375, "completions/mean_terminated_length": 134.484375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.321657021017362, "grad_norm": 2.170384168624878, "kl": 4.0703125, "learning_rate": 6.1201355055057486e-06, "loss": 0.1064, "num_tokens": 48891451.0, "reward": -0.21826171875, "reward_std": 0.7620024681091309, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.1396484375, "rewards/ppl_reward/std": 1.722015380859375, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 108.609375, "completions/mean_terminated_length": 108.609375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 3.322875418824246, "grad_norm": 2.3763163089752197, "kl": 3.2890625, "learning_rate": 6.112290039340274e-06, "loss": 0.0204, "num_tokens": 48904450.0, "reward": -2.013671875, "reward_std": 0.9202142953872681, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.70703125, "rewards/ppl_reward/std": 5.89168643951416, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 129.984375, "completions/mean_terminated_length": 129.984375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.3240938166311302, "grad_norm": 3.070330858230591, "kl": 6.84375, "learning_rate": 6.104447391173859e-06, "loss": 0.3002, "num_tokens": 48918977.0, "reward": -0.6986083984375, "reward_std": 1.641679286956787, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.920654296875, "rewards/ppl_reward/std": 4.853825569152832, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17390352487564087, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 125.703125, "completions/mean_terminated_length": 125.703125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.325312214438014, "grad_norm": 3.2637150287628174, "kl": 7.5, "learning_rate": 6.096607566691235e-06, "loss": 0.2384, "num_tokens": 48933222.0, "reward": -1.3919677734375, "reward_std": 1.2625435590744019, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.283935546875, "rewards/ppl_reward/std": 4.013247489929199, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 130.578125, "completions/mean_terminated_length": 130.578125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 3.326530612244898, "grad_norm": 2.2948248386383057, "kl": 5.1484375, "learning_rate": 6.088770571575082e-06, "loss": 0.2216, "num_tokens": 48948507.0, "reward": -3.4808349609375, "reward_std": 1.3485649824142456, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.649169921875, "rewards/ppl_reward/std": 10.005367279052734, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 116.828125, "completions/mean_terminated_length": 116.828125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.327749010051782, "grad_norm": 1.7816574573516846, "kl": 3.296875, "learning_rate": 6.080936411506036e-06, "loss": 0.1169, "num_tokens": 48962296.0, "reward": -1.5931396484375, "reward_std": 0.7278931736946106, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.928466796875, "rewards/ppl_reward/std": 3.134652614593506, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 143.765625, "completions/mean_terminated_length": 143.765625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 3.328967407858666, "grad_norm": 2.1938836574554443, "kl": 5.078125, "learning_rate": 6.0731050921626614e-06, "loss": 0.218, "num_tokens": 48978857.0, "reward": -0.2645263671875, "reward_std": 0.6267048120498657, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.232177734375, "rewards/ppl_reward/std": 1.789285659790039, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 135.34375, "completions/mean_terminated_length": 135.34375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.3301858056655496, "grad_norm": 2.9372329711914062, "kl": 6.478515625, "learning_rate": 6.065276619221485e-06, "loss": 0.4109, "num_tokens": 48994495.0, "reward": -0.81805419921875, "reward_std": 0.6559127569198608, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.4173583984375, "rewards/ppl_reward/std": 4.080341815948486, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.3314042034724336, "grad_norm": 2.458702802658081, "kl": 5.9296875, "learning_rate": 6.057450998356955e-06, "loss": 0.1482, "num_tokens": 49010615.0, "reward": -1.29296875, "reward_std": 1.1503300666809082, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.0625, "rewards/ppl_reward/std": 4.4035749435424805, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2722889184951782, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.3326226012793176, "grad_norm": 2.305715560913086, "kl": 3.90234375, "learning_rate": 6.049628235241459e-06, "loss": 0.1176, "num_tokens": 49026099.0, "reward": -1.63232421875, "reward_std": 0.782080352306366, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.0224609375, "rewards/ppl_reward/std": 2.9096665382385254, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 132.3125, "completions/mean_terminated_length": 132.3125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.3338409990862017, "grad_norm": 1.6420574188232422, "kl": 2.775390625, "learning_rate": 6.041808335545315e-06, "loss": 0.0507, "num_tokens": 49042135.0, "reward": -0.345947265625, "reward_std": 0.3266380727291107, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.48095703125, "rewards/ppl_reward/std": 2.0228142738342285, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.3350593968930857, "grad_norm": 2.1964006423950195, "kl": 4.68359375, "learning_rate": 6.033991304936757e-06, "loss": 0.1477, "num_tokens": 49057759.0, "reward": -0.23779296875, "reward_std": 0.6062828302383423, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.2099609375, "rewards/ppl_reward/std": 2.107419967651367, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 117.515625, "completions/mean_terminated_length": 117.515625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.3362777946999698, "grad_norm": 1.3254026174545288, "kl": 4.69140625, "learning_rate": 6.026177149081949e-06, "loss": 0.0732, "num_tokens": 49072056.0, "reward": -7.13037109375, "reward_std": 3.2278823852539062, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -17.7451171875, "rewards/ppl_reward/std": 27.801925659179688, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2366211861371994, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 117.984375, "completions/mean_terminated_length": 117.984375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.3374961925068534, "grad_norm": 2.946878671646118, "kl": 2.1181640625, "learning_rate": 6.018365873644962e-06, "loss": 0.0679, "num_tokens": 49086247.0, "reward": -0.9866943359375, "reward_std": 0.21701106429100037, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -5.926513671875, "rewards/ppl_reward/std": 4.03633975982666, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 128.109375, "completions/mean_terminated_length": 128.109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.3387145903137374, "grad_norm": 1.6521785259246826, "kl": 2.30859375, "learning_rate": 6.010557484287791e-06, "loss": 0.0395, "num_tokens": 49101206.0, "reward": -0.77264404296875, "reward_std": 0.5247480273246765, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": -5.4437255859375, "rewards/ppl_reward/std": 2.9424471855163574, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 118.671875, "completions/mean_terminated_length": 118.671875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.3399329881206214, "grad_norm": 1.9058891534805298, "kl": 3.41015625, "learning_rate": 6.002751986670323e-06, "loss": 0.1133, "num_tokens": 49115617.0, "reward": -3.40234375, "reward_std": 0.9697099328041077, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -10.671875, "rewards/ppl_reward/std": 7.166099548339844, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 120.3125, "completions/mean_terminated_length": 120.3125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.3411513859275055, "grad_norm": 2.459683895111084, "kl": 4.1484375, "learning_rate": 5.994949386450366e-06, "loss": 0.0583, "num_tokens": 49129525.0, "reward": -3.132080078125, "reward_std": 2.2272064685821533, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.95166015625, "rewards/ppl_reward/std": 9.230669975280762, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 139.640625, "completions/mean_terminated_length": 139.640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.342369783734389, "grad_norm": 3.3716368675231934, "kl": 4.6640625, "learning_rate": 5.987149689283614e-06, "loss": 0.146, "num_tokens": 49146334.0, "reward": -7.9788818359375, "reward_std": 1.8408135175704956, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -19.629638671875, "rewards/ppl_reward/std": 28.203014373779297, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 131.84375, "completions/mean_terminated_length": 131.84375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.343588181541273, "grad_norm": 3.1444718837738037, "kl": 6.2578125, "learning_rate": 5.979352900823663e-06, "loss": 0.3127, "num_tokens": 49161556.0, "reward": -1.0447998046875, "reward_std": 1.0015997886657715, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.613037109375, "rewards/ppl_reward/std": 2.540083169937134, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2184663861989975, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.344806579348157, "grad_norm": 1.808229684829712, "kl": 3.10546875, "learning_rate": 5.971559026722005e-06, "loss": 0.0714, "num_tokens": 49176296.0, "reward": -0.61474609375, "reward_std": 0.4488148093223572, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.9951171875, "rewards/ppl_reward/std": 2.237462282180786, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.346024977155041, "grad_norm": 1.942673921585083, "kl": 4.0576171875, "learning_rate": 5.963768072628009e-06, "loss": 0.1429, "num_tokens": 49192778.0, "reward": -1.0552978515625, "reward_std": 0.5827010273933411, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.782470703125, "rewards/ppl_reward/std": 1.6489392518997192, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 120.5625, "completions/mean_terminated_length": 120.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.3472433749619253, "grad_norm": 2.8392932415008545, "kl": 3.96875, "learning_rate": 5.955980044188935e-06, "loss": 0.062, "num_tokens": 49207222.0, "reward": -2.20751953125, "reward_std": 1.5692561864852905, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.0791015625, "rewards/ppl_reward/std": 6.3322062492370605, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2215663492679596, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 127.3125, "completions/mean_terminated_length": 127.3125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.348461772768809, "grad_norm": 2.3911995887756348, "kl": 8.5625, "learning_rate": 5.9481949470499255e-06, "loss": 0.4215, "num_tokens": 49222162.0, "reward": -1.5758056640625, "reward_std": 1.2878119945526123, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.721923828125, "rewards/ppl_reward/std": 6.956003189086914, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1526367962360382, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 129.640625, "completions/mean_terminated_length": 129.640625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.349680170575693, "grad_norm": 3.5062947273254395, "kl": 4.37890625, "learning_rate": 5.940412786853985e-06, "loss": 0.1276, "num_tokens": 49237835.0, "reward": -0.7900390625, "reward_std": 0.543891429901123, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.298828125, "rewards/ppl_reward/std": 3.849944829940796, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 114.703125, "completions/mean_terminated_length": 114.703125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.350898568382577, "grad_norm": 4.154664516448975, "kl": 8.3984375, "learning_rate": 5.932633569242e-06, "loss": 0.2287, "num_tokens": 49252040.0, "reward": -0.8043212890625, "reward_std": 0.9315625429153442, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -4.757080078125, "rewards/ppl_reward/std": 2.0882251262664795, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.21789801120758057, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 126.796875, "completions/mean_terminated_length": 126.796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.352116966189461, "grad_norm": 4.412173271179199, "kl": 10.1875, "learning_rate": 5.9248572998527225e-06, "loss": 0.3414, "num_tokens": 49267963.0, "reward": -1.1812744140625, "reward_std": 1.1447312831878662, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -5.581298828125, "rewards/ppl_reward/std": 3.0202794075012207, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2221602201461792, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 157.59375, "completions/mean_terminated_length": 157.59375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.3533353639963446, "grad_norm": 3.2822957038879395, "kl": 10.453125, "learning_rate": 5.917083984322765e-06, "loss": 0.5449, "num_tokens": 49287217.0, "reward": -0.2685546875, "reward_std": 0.5904902815818787, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "rewards/ppl_reward/mean": -3.841796875, "rewards/ppl_reward/std": 1.567834496498108, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1354166716337204, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 123.96875, "completions/mean_terminated_length": 123.96875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.3545537618032286, "grad_norm": 3.8495569229125977, "kl": 9.53125, "learning_rate": 5.9093136282866014e-06, "loss": 0.4374, "num_tokens": 49302815.0, "reward": -2.6639404296875, "reward_std": 1.413076639175415, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -8.726318359375, "rewards/ppl_reward/std": 7.841933727264404, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.2582649290561676, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 116.03125, "completions/mean_terminated_length": 116.03125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 3.3557721596101127, "grad_norm": 2.718205213546753, "kl": 4.1328125, "learning_rate": 5.901546237376561e-06, "loss": 0.0815, "num_tokens": 49317161.0, "reward": -2.167724609375, "reward_std": 0.5920048952102661, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.92138671875, "rewards/ppl_reward/std": 7.435850143432617, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17390352487564087, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.3569905574169967, "grad_norm": 2.202258825302124, "kl": 4.619140625, "learning_rate": 5.893781817222818e-06, "loss": 0.1722, "num_tokens": 49332303.0, "reward": -4.99365234375, "reward_std": 4.4102396965026855, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -13.6591796875, "rewards/ppl_reward/std": 26.564504623413086, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 124.90625, "completions/mean_terminated_length": 124.90625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.3582089552238807, "grad_norm": 3.6429309844970703, "kl": 3.0234375, "learning_rate": 5.886020373453398e-06, "loss": 0.1612, "num_tokens": 49347681.0, "reward": -0.9298095703125, "reward_std": 0.40789586305618286, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.703369140625, "rewards/ppl_reward/std": 3.484748601913452, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.359427353030765, "grad_norm": 2.3481783866882324, "kl": 4.79296875, "learning_rate": 5.878261911694176e-06, "loss": 0.1594, "num_tokens": 49362049.0, "reward": -1.2041015625, "reward_std": 0.9523513317108154, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.048828125, "rewards/ppl_reward/std": 4.066959381103516, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.3606457508376484, "grad_norm": 1.6501035690307617, "kl": 4.0625, "learning_rate": 5.870506437568851e-06, "loss": 0.0804, "num_tokens": 49376729.0, "reward": -0.87841796875, "reward_std": 0.8741946220397949, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.2646484375, "rewards/ppl_reward/std": 3.7032370567321777, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 124.765625, "completions/mean_terminated_length": 124.765625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.3618641486445324, "grad_norm": 1.9823596477508545, "kl": 4.65234375, "learning_rate": 5.862753956698959e-06, "loss": 0.1462, "num_tokens": 49391794.0, "reward": -0.88690185546875, "reward_std": 0.6462997198104858, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.3441162109375, "rewards/ppl_reward/std": 3.2557947635650635, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 125.546875, "completions/mean_terminated_length": 125.546875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.3630825464514165, "grad_norm": 2.7315688133239746, "kl": 7.603515625, "learning_rate": 5.855004474703878e-06, "loss": 0.459, "num_tokens": 49406813.0, "reward": -3.1640625, "reward_std": 1.1270523071289062, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.8515625, "rewards/ppl_reward/std": 10.667824745178223, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16810208559036255, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 126.546875, "completions/mean_terminated_length": 126.546875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.3643009442583005, "grad_norm": 2.668421745300293, "kl": 3.37109375, "learning_rate": 5.847257997200806e-06, "loss": 0.2096, "num_tokens": 49421912.0, "reward": -2.7613525390625, "reward_std": 0.526294469833374, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -9.249267578125, "rewards/ppl_reward/std": 7.714885711669922, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 114.984375, "completions/mean_terminated_length": 114.984375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.365519342065184, "grad_norm": 1.9718713760375977, "kl": 3.548828125, "learning_rate": 5.839514529804757e-06, "loss": 0.0267, "num_tokens": 49436479.0, "reward": -2.43603515625, "reward_std": 0.8486965298652649, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.5830078125, "rewards/ppl_reward/std": 6.922186851501465, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 111.609375, "completions/mean_terminated_length": 111.609375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.366737739872068, "grad_norm": 2.1626601219177246, "kl": 2.0, "learning_rate": 5.831774078128574e-06, "loss": -0.026, "num_tokens": 49450046.0, "reward": -1.5289306640625, "reward_std": 0.4688301384449005, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.885986328125, "rewards/ppl_reward/std": 3.5509822368621826, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 116.40625, "completions/mean_terminated_length": 116.40625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.367956137678952, "grad_norm": 1.6107218265533447, "kl": 3.54296875, "learning_rate": 5.8240366477829025e-06, "loss": 0.0484, "num_tokens": 49464896.0, "reward": -3.3797607421875, "reward_std": 1.9123482704162598, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -10.275146484375, "rewards/ppl_reward/std": 7.411552429199219, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.3691745354858362, "grad_norm": 2.1194710731506348, "kl": 4.8515625, "learning_rate": 5.816302244376206e-06, "loss": 0.1267, "num_tokens": 49479264.0, "reward": -2.143798828125, "reward_std": 0.7137589454650879, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.93603515625, "rewards/ppl_reward/std": 5.3222880363464355, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 128.109375, "completions/mean_terminated_length": 128.109375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.3703929332927203, "grad_norm": 2.5476152896881104, "kl": 6.2734375, "learning_rate": 5.808570873514758e-06, "loss": 0.316, "num_tokens": 49494055.0, "reward": -1.015625, "reward_std": 0.9257046580314636, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.609375, "rewards/ppl_reward/std": 2.539578676223755, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 122.671875, "completions/mean_terminated_length": 122.671875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.371611331099604, "grad_norm": 2.0887444019317627, "kl": 5.25390625, "learning_rate": 5.800842540802618e-06, "loss": 0.2338, "num_tokens": 49509122.0, "reward": -1.359130859375, "reward_std": 0.9890241622924805, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.23388671875, "rewards/ppl_reward/std": 3.8931009769439697, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.372829728906488, "grad_norm": 2.667534112930298, "kl": 4.5, "learning_rate": 5.793117251841659e-06, "loss": 0.1753, "num_tokens": 49525122.0, "reward": -1.406982421875, "reward_std": 0.4489530026912689, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.57177734375, "rewards/ppl_reward/std": 3.5120155811309814, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 143.109375, "completions/mean_terminated_length": 143.109375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.374048126713372, "grad_norm": 1.9782240390777588, "kl": 4.82421875, "learning_rate": 5.785395012231543e-06, "loss": 0.1301, "num_tokens": 49541153.0, "reward": -0.725341796875, "reward_std": 0.8102187514305115, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.03662109375, "rewards/ppl_reward/std": 2.455993175506592, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17951758205890656, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 113.9375, "completions/mean_terminated_length": 113.9375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.375266524520256, "grad_norm": 2.8554434776306152, "kl": 4.623046875, "learning_rate": 5.7776758275697155e-06, "loss": 0.1347, "num_tokens": 49554573.0, "reward": -1.0318603515625, "reward_std": 0.41913744807243347, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.743408203125, "rewards/ppl_reward/std": 3.3311328887939453, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 128.96875, "completions/mean_terminated_length": 128.96875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.3764849223271396, "grad_norm": 2.1373484134674072, "kl": 4.529296875, "learning_rate": 5.769959703451413e-06, "loss": 0.0832, "num_tokens": 49569811.0, "reward": -2.2060546875, "reward_std": 1.3944728374481201, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.998046875, "rewards/ppl_reward/std": 6.795579433441162, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13495801389217377, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 121.296875, "completions/mean_terminated_length": 121.296875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.3777033201340236, "grad_norm": 2.0624523162841797, "kl": 4.4658203125, "learning_rate": 5.762246645469655e-06, "loss": 0.1538, "num_tokens": 49584286.0, "reward": -1.38934326171875, "reward_std": 0.46892523765563965, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.5364990234375, "rewards/ppl_reward/std": 4.829164028167725, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 122.65625, "completions/mean_terminated_length": 122.65625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.3789217179409077, "grad_norm": 1.5652847290039062, "kl": 3.1220703125, "learning_rate": 5.754536659215239e-06, "loss": 0.0742, "num_tokens": 49599296.0, "reward": -1.583984375, "reward_std": 0.7322323322296143, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.92578125, "rewards/ppl_reward/std": 3.2183382511138916, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 127.71875, "completions/mean_terminated_length": 127.71875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.3801401157477917, "grad_norm": 3.6932196617126465, "kl": 7.3125, "learning_rate": 5.74682975027673e-06, "loss": 0.2359, "num_tokens": 49615270.0, "reward": -0.72027587890625, "reward_std": 0.9138922691345215, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.9952392578125, "rewards/ppl_reward/std": 2.8203201293945312, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 121.71875, "completions/mean_terminated_length": 121.71875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.3813585135546758, "grad_norm": 1.868760347366333, "kl": 3.6015625, "learning_rate": 5.739125924240472e-06, "loss": 0.0955, "num_tokens": 49630084.0, "reward": -1.25439453125, "reward_std": 0.7777390480041504, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.1884765625, "rewards/ppl_reward/std": 2.5198583602905273, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.3825769113615594, "grad_norm": 2.006605863571167, "kl": 5.609375, "learning_rate": 5.731425186690564e-06, "loss": 0.1367, "num_tokens": 49644788.0, "reward": -0.790283203125, "reward_std": 1.0806612968444824, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.13525390625, "rewards/ppl_reward/std": 4.558717250823975, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 141.859375, "completions/mean_terminated_length": 141.859375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.3837953091684434, "grad_norm": 3.069587230682373, "kl": 6.76953125, "learning_rate": 5.723727543208867e-06, "loss": 0.2259, "num_tokens": 49661979.0, "reward": -0.76898193359375, "reward_std": 0.9816839694976807, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.0770263671875, "rewards/ppl_reward/std": 1.9326064586639404, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.25341787934303284, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 131.40625, "completions/mean_terminated_length": 131.40625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.3850137069753274, "grad_norm": 7.836112022399902, "kl": 9.828125, "learning_rate": 5.716032999375006e-06, "loss": 0.3779, "num_tokens": 49678221.0, "reward": -2.9732666015625, "reward_std": 1.7491053342819214, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -9.376220703125, "rewards/ppl_reward/std": 7.266331195831299, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.2382858246564865, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.3862321047822115, "grad_norm": 3.3689913749694824, "kl": 5.87890625, "learning_rate": 5.708341560766357e-06, "loss": 0.2288, "num_tokens": 49693035.0, "reward": -2.0855712890625, "reward_std": 0.9404774308204651, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.843017578125, "rewards/ppl_reward/std": 6.929709434509277, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 116.578125, "completions/mean_terminated_length": 116.578125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.3874505025890955, "grad_norm": 3.7537081241607666, "kl": 7.80859375, "learning_rate": 5.700653232958047e-06, "loss": 0.2665, "num_tokens": 49707176.0, "reward": -2.62353515625, "reward_std": 1.2536085844039917, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.7314453125, "rewards/ppl_reward/std": 4.59653902053833, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21921011805534363, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 124.40625, "completions/mean_terminated_length": 124.40625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.388668900395979, "grad_norm": 1.6937191486358643, "kl": 2.8984375, "learning_rate": 5.692968021522944e-06, "loss": 0.1292, "num_tokens": 49721666.0, "reward": -1.7896728515625, "reward_std": 0.3908161520957947, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.407470703125, "rewards/ppl_reward/std": 5.114334583282471, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 116.078125, "completions/mean_terminated_length": 116.078125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.389887298202863, "grad_norm": 2.294351100921631, "kl": 4.5703125, "learning_rate": 5.685285932031654e-06, "loss": 0.147, "num_tokens": 49735639.0, "reward": -1.3681640625, "reward_std": 0.6346081495285034, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.408203125, "rewards/ppl_reward/std": 3.4313035011291504, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 137.921875, "completions/mean_terminated_length": 137.921875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.391105696009747, "grad_norm": 1.5569748878479004, "kl": 2.783203125, "learning_rate": 5.67760697005253e-06, "loss": 0.0501, "num_tokens": 49751618.0, "reward": -2.27001953125, "reward_std": 1.652556300163269, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.2353515625, "rewards/ppl_reward/std": 9.02080249786377, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 121.453125, "completions/mean_terminated_length": 121.453125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.3923240938166312, "grad_norm": 1.9527252912521362, "kl": 2.83984375, "learning_rate": 5.669931141151654e-06, "loss": 0.0431, "num_tokens": 49766615.0, "reward": -0.9072265625, "reward_std": 0.8665384650230408, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.556640625, "rewards/ppl_reward/std": 2.6110928058624268, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 123.46875, "completions/mean_terminated_length": 123.46875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.3935424916235153, "grad_norm": 2.3420419692993164, "kl": 5.615234375, "learning_rate": 5.66225845089283e-06, "loss": 0.1665, "num_tokens": 49781085.0, "reward": -3.1556396484375, "reward_std": 2.443666458129883, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -9.772216796875, "rewards/ppl_reward/std": 9.582261085510254, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.29523226618766785, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.394760889430399, "grad_norm": 2.7490339279174805, "kl": 4.9384765625, "learning_rate": 5.654588904837595e-06, "loss": 0.1946, "num_tokens": 49795761.0, "reward": -1.086669921875, "reward_std": 0.8849896788597107, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.90771484375, "rewards/ppl_reward/std": 2.938992738723755, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 141.953125, "completions/mean_terminated_length": 141.953125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.395979287237283, "grad_norm": 1.4934542179107666, "kl": 2.2705078125, "learning_rate": 5.6469225085452055e-06, "loss": 0.1112, "num_tokens": 49811750.0, "reward": -1.4400634765625, "reward_std": 0.36493349075317383, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.669189453125, "rewards/ppl_reward/std": 6.266151428222656, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.397197685044167, "grad_norm": 1.7386518716812134, "kl": 2.5703125, "learning_rate": 5.639259267572638e-06, "loss": 0.0682, "num_tokens": 49825446.0, "reward": -0.468994140625, "reward_std": 0.32324421405792236, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.79736328125, "rewards/ppl_reward/std": 1.77435302734375, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 121.84375, "completions/mean_terminated_length": 121.84375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.398416082851051, "grad_norm": 1.7360156774520874, "kl": 2.775390625, "learning_rate": 5.63159918747457e-06, "loss": -0.0109, "num_tokens": 49839908.0, "reward": -1.1031494140625, "reward_std": 1.1933976411819458, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.878173828125, "rewards/ppl_reward/std": 4.2494049072265625, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2291666716337204, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 132.328125, "completions/mean_terminated_length": 132.328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.3996344806579346, "grad_norm": 1.5053356885910034, "kl": 2.671875, "learning_rate": 5.623942273803399e-06, "loss": 0.1375, "num_tokens": 49855425.0, "reward": -1.1268310546875, "reward_std": 0.6165285706520081, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.027099609375, "rewards/ppl_reward/std": 4.3152265548706055, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 140.7619171142578, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.4008528784648187, "grad_norm": 2.8250322341918945, "kl": 6.2646484375, "learning_rate": 5.616288532109225e-06, "loss": 0.3883, "num_tokens": 49873093.0, "reward": -3.4422607421875, "reward_std": 1.0524013042449951, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.603271484375, "rewards/ppl_reward/std": 7.207784175872803, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 138.703125, "completions/mean_terminated_length": 138.703125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.4020712762717027, "grad_norm": 1.9000706672668457, "kl": 3.806640625, "learning_rate": 5.608637967939848e-06, "loss": 0.181, "num_tokens": 49889394.0, "reward": 0.20745849609375, "reward_std": 0.596442699432373, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -3.2569580078125, "rewards/ppl_reward/std": 1.1394940614700317, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 125.265625, "completions/mean_terminated_length": 125.265625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.4032896740785867, "grad_norm": 1.647748351097107, "kl": 3.34765625, "learning_rate": 5.600990586840768e-06, "loss": 0.015, "num_tokens": 49903947.0, "reward": -0.9046630859375, "reward_std": 1.0475261211395264, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.520263671875, "rewards/ppl_reward/std": 3.1095075607299805, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.4045080718854708, "grad_norm": 3.1983423233032227, "kl": 7.013671875, "learning_rate": 5.593346394355167e-06, "loss": 0.2544, "num_tokens": 49921443.0, "reward": -1.3411865234375, "reward_std": 0.9430695176124573, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.112060546875, "rewards/ppl_reward/std": 4.621097087860107, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2138771414756775, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 139.234375, "completions/mean_terminated_length": 139.234375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.4057264696923544, "grad_norm": 3.720613956451416, "kl": 5.5498046875, "learning_rate": 5.585705396023931e-06, "loss": 0.1578, "num_tokens": 49937738.0, "reward": -0.552978515625, "reward_std": 0.7799098491668701, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.78564453125, "rewards/ppl_reward/std": 2.51715087890625, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 128.484375, "completions/mean_terminated_length": 128.484375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.4069448674992384, "grad_norm": 2.0253963470458984, "kl": 2.732421875, "learning_rate": 5.5780675973856135e-06, "loss": 0.1261, "num_tokens": 49952441.0, "reward": -2.030029296875, "reward_std": 0.4399009346961975, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.86474609375, "rewards/ppl_reward/std": 6.171125411987305, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 145.046875, "completions/mean_terminated_length": 145.046875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.4081632653061225, "grad_norm": 1.6105002164840698, "kl": 4.521484375, "learning_rate": 5.570433003976464e-06, "loss": 0.1146, "num_tokens": 49969684.0, "reward": -0.1820068359375, "reward_std": 0.7213600873947144, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -3.957763671875, "rewards/ppl_reward/std": 1.1398643255233765, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 152.328125, "completions/mean_terminated_length": 152.328125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.4093816631130065, "grad_norm": 1.609332799911499, "kl": 3.60546875, "learning_rate": 5.562801621330402e-06, "loss": 0.1252, "num_tokens": 49987049.0, "reward": -1.3375244140625, "reward_std": 0.47596412897109985, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.393798828125, "rewards/ppl_reward/std": 2.9954071044921875, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 140.421875, "completions/mean_terminated_length": 126.39683532714844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.41060006091989, "grad_norm": 1.852883219718933, "kl": 6.857421875, "learning_rate": 5.555173454979021e-06, "loss": 0.2887, "num_tokens": 50002844.0, "reward": -2.4354248046875, "reward_std": 1.2861480712890625, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.370849609375, "rewards/ppl_reward/std": 4.271894931793213, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2763853967189789, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 142.203125, "completions/mean_terminated_length": 142.203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.411818458726774, "grad_norm": 2.3640828132629395, "kl": 7.03125, "learning_rate": 5.547548510451588e-06, "loss": 0.3588, "num_tokens": 50018817.0, "reward": -2.4490966796875, "reward_std": 2.10556697845459, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.445068359375, "rewards/ppl_reward/std": 8.05451774597168, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.24077731370925903, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 125.046875, "completions/mean_terminated_length": 125.046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.413036856533658, "grad_norm": 1.6847290992736816, "kl": 2.498046875, "learning_rate": 5.539926793275021e-06, "loss": 0.0073, "num_tokens": 50033532.0, "reward": -2.365478515625, "reward_std": 1.2417689561843872, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -8.52001953125, "rewards/ppl_reward/std": 6.066605091094971, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 128.8125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 3.4142552543405422, "grad_norm": 2.408684730529785, "kl": 4.75, "learning_rate": 5.532308308973907e-06, "loss": 0.2179, "num_tokens": 50048968.0, "reward": -2.3131103515625, "reward_std": 0.540611982345581, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -8.360595703125, "rewards/ppl_reward/std": 3.8626489639282227, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 133.65625, "completions/mean_terminated_length": 133.65625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.4154736521474263, "grad_norm": 2.7403552532196045, "kl": 5.859375, "learning_rate": 5.524693063070492e-06, "loss": 0.3447, "num_tokens": 50064458.0, "reward": -1.214111328125, "reward_std": 1.3568531274795532, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.03759765625, "rewards/ppl_reward/std": 6.982995986938477, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 137.609375, "completions/mean_terminated_length": 137.609375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.4166920499543103, "grad_norm": 1.4826596975326538, "kl": 3.349609375, "learning_rate": 5.517081061084673e-06, "loss": 0.068, "num_tokens": 50080401.0, "reward": -1.913818359375, "reward_std": 0.6610670685768127, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.67138671875, "rewards/ppl_reward/std": 11.702398300170898, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 131.15625, "completions/mean_terminated_length": 131.15625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 3.417910447761194, "grad_norm": 1.6800507307052612, "kl": 2.619140625, "learning_rate": 5.509472308533994e-06, "loss": 0.0857, "num_tokens": 50095347.0, "reward": -3.34515380859375, "reward_std": 1.636138916015625, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -10.4637451171875, "rewards/ppl_reward/std": 13.275574684143066, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 132.84375, "completions/mean_terminated_length": 132.84375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.419128845568078, "grad_norm": 1.5506501197814941, "kl": 2.701171875, "learning_rate": 5.501866810933645e-06, "loss": 0.0225, "num_tokens": 50110393.0, "reward": -0.630615234375, "reward_std": 0.7871378660202026, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.94873046875, "rewards/ppl_reward/std": 2.2192556858062744, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.208927720785141, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 135.140625, "completions/mean_terminated_length": 135.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.420347243374962, "grad_norm": 4.109686374664307, "kl": 3.39453125, "learning_rate": 5.49426457379646e-06, "loss": 0.0464, "num_tokens": 50126002.0, "reward": -0.89453125, "reward_std": 0.4448586106300354, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.546875, "rewards/ppl_reward/std": 2.6094470024108887, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 132.125, "completions/mean_terminated_length": 132.125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.421565641181846, "grad_norm": 2.463880777359009, "kl": 3.138671875, "learning_rate": 5.486665602632899e-06, "loss": 0.0674, "num_tokens": 50141090.0, "reward": -1.595703125, "reward_std": 0.732124924659729, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.86328125, "rewards/ppl_reward/std": 4.690835952758789, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 138.453125, "completions/mean_terminated_length": 138.453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.4227840389887296, "grad_norm": 2.4294331073760986, "kl": 2.375, "learning_rate": 5.479069902951064e-06, "loss": 0.0626, "num_tokens": 50157327.0, "reward": -0.3651123046875, "reward_std": 0.5351249575614929, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -4.542724609375, "rewards/ppl_reward/std": 1.814449429512024, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 137.8125, "completions/mean_terminated_length": 137.8125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.4240024367956137, "grad_norm": 1.6050516366958618, "kl": 2.5126953125, "learning_rate": 5.47147748025669e-06, "loss": 0.0626, "num_tokens": 50173347.0, "reward": -1.246337890625, "reward_std": 0.43039146065711975, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.34423828125, "rewards/ppl_reward/std": 3.267192840576172, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 146.1875, "completions/mean_terminated_length": 146.1875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.4252208346024977, "grad_norm": 2.6034932136535645, "kl": 5.166015625, "learning_rate": 5.463888340053118e-06, "loss": 0.176, "num_tokens": 50189919.0, "reward": -1.220458984375, "reward_std": 1.255481481552124, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.94091796875, "rewards/ppl_reward/std": 3.599003791809082, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.265398770570755, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 130.03125, "completions/mean_terminated_length": 130.03125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.4264392324093818, "grad_norm": 1.9391367435455322, "kl": 6.16015625, "learning_rate": 5.45630248784133e-06, "loss": 0.2179, "num_tokens": 50205185.0, "reward": -2.563232421875, "reward_std": 1.008171796798706, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.72802734375, "rewards/ppl_reward/std": 6.134313106536865, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 297.171875, "completions/mean_terminated_length": 129.4423065185547, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.427657630216266, "grad_norm": 1.2058987617492676, "kl": 2.296875, "learning_rate": 5.448719929119916e-06, "loss": 0.1538, "num_tokens": 50230756.0, "reward": -0.80706787109375, "reward_std": 0.6737287640571594, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "rewards/ppl_reward/mean": -4.8016357421875, "rewards/ppl_reward/std": 2.963438034057617, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 135.453125, "completions/mean_terminated_length": 135.453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.4288760280231494, "grad_norm": 1.5115782022476196, "kl": 3.580078125, "learning_rate": 5.441140669385073e-06, "loss": 0.0869, "num_tokens": 50246457.0, "reward": -2.7496337890625, "reward_std": 0.974250078201294, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.132080078125, "rewards/ppl_reward/std": 8.435057640075684, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 123.8125, "completions/mean_terminated_length": 123.8125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.4300944258300334, "grad_norm": 1.3743064403533936, "kl": 4.3359375, "learning_rate": 5.433564714130617e-06, "loss": 0.1493, "num_tokens": 50260773.0, "reward": -1.6778564453125, "reward_std": 0.7499014139175415, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.113525390625, "rewards/ppl_reward/std": 3.3581066131591797, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 136.015625, "completions/mean_terminated_length": 136.015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.4313128236369175, "grad_norm": 1.9385906457901, "kl": 5.123046875, "learning_rate": 5.425992068847965e-06, "loss": 0.1553, "num_tokens": 50276422.0, "reward": -1.8343505859375, "reward_std": 1.10595703125, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.239013671875, "rewards/ppl_reward/std": 6.071362495422363, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 148.703125, "completions/mean_terminated_length": 148.703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.4325312214438015, "grad_norm": 1.922868251800537, "kl": 5.130859375, "learning_rate": 5.4184227390261344e-06, "loss": 0.2131, "num_tokens": 50293139.0, "reward": -0.8687744140625, "reward_std": 0.8864339590072632, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.276611328125, "rewards/ppl_reward/std": 2.6000797748565674, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.20740120112895966, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 142.515625, "completions/mean_terminated_length": 142.515625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.433749619250685, "grad_norm": 3.049480676651001, "kl": 3.509765625, "learning_rate": 5.410856730151736e-06, "loss": 0.1283, "num_tokens": 50309340.0, "reward": -4.784912109375, "reward_std": 2.42578125, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -13.26513671875, "rewards/ppl_reward/std": 18.064260482788086, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2028672844171524, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.434968017057569, "grad_norm": 2.4062001705169678, "kl": 4.265625, "learning_rate": 5.403294047708987e-06, "loss": 0.1585, "num_tokens": 50325532.0, "reward": -1.6551513671875, "reward_std": 0.9916486144065857, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.052490234375, "rewards/ppl_reward/std": 3.4068238735198975, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 129.859375, "completions/mean_terminated_length": 129.859375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.436186414864453, "grad_norm": 2.3186683654785156, "kl": 4.05859375, "learning_rate": 5.395734697179673e-06, "loss": 0.0763, "num_tokens": 50340819.0, "reward": -1.32470703125, "reward_std": 0.3229046165943146, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.5009765625, "rewards/ppl_reward/std": 2.4533278942108154, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 129.109375, "completions/mean_terminated_length": 129.109375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.4374048126713372, "grad_norm": 1.87411367893219, "kl": 3.98046875, "learning_rate": 5.388178684043175e-06, "loss": 0.1256, "num_tokens": 50355410.0, "reward": -1.0341796875, "reward_std": 0.8278883099555969, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.708984375, "rewards/ppl_reward/std": 4.599308490753174, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 148.859375, "completions/mean_terminated_length": 148.859375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.4386232104782213, "grad_norm": 1.6781593561172485, "kl": 4.98828125, "learning_rate": 5.380626013776456e-06, "loss": 0.1744, "num_tokens": 50372457.0, "reward": -0.863525390625, "reward_std": 0.7803138494491577, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.32861328125, "rewards/ppl_reward/std": 3.3033976554870605, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15900352597236633, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 147.609375, "completions/mean_terminated_length": 147.609375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.4398416082851053, "grad_norm": 3.5220675468444824, "kl": 3.888671875, "learning_rate": 5.373076691854054e-06, "loss": 0.1708, "num_tokens": 50389832.0, "reward": -0.93768310546875, "reward_std": 0.6035960912704468, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.6800537109375, "rewards/ppl_reward/std": 2.2229549884796143, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 133.953125, "completions/mean_terminated_length": 133.953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.441060006091989, "grad_norm": 1.6474696397781372, "kl": 3.4423828125, "learning_rate": 5.365530723748083e-06, "loss": 0.0197, "num_tokens": 50405285.0, "reward": -0.39599609375, "reward_std": 0.889252781867981, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.4091796875, "rewards/ppl_reward/std": 1.9535682201385498, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 133.96875, "completions/mean_terminated_length": 133.96875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.442278403898873, "grad_norm": 1.5270640850067139, "kl": 1.408203125, "learning_rate": 5.357988114928221e-06, "loss": 0.0495, "num_tokens": 50420459.0, "reward": -4.6290283203125, "reward_std": 0.855014443397522, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -13.078369140625, "rewards/ppl_reward/std": 22.46055793762207, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1269075721502304, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.443496801705757, "grad_norm": 1.5597401857376099, "kl": 3.3642578125, "learning_rate": 5.350448870861719e-06, "loss": 0.1704, "num_tokens": 50436587.0, "reward": 0.09600830078125, "reward_std": 0.31527793407440186, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -3.6048583984375, "rewards/ppl_reward/std": 1.3144272565841675, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 134.96875, "completions/mean_terminated_length": 134.96875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.444715199512641, "grad_norm": 1.8807697296142578, "kl": 3.3212890625, "learning_rate": 5.342912997013374e-06, "loss": 0.1303, "num_tokens": 50451737.0, "reward": -0.543212890625, "reward_std": 0.5703001618385315, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.76611328125, "rewards/ppl_reward/std": 2.1717135906219482, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 129.09375, "completions/mean_terminated_length": 129.09375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.4459335973195246, "grad_norm": 1.9098485708236694, "kl": 2.76171875, "learning_rate": 5.335380498845559e-06, "loss": 0.0464, "num_tokens": 50467127.0, "reward": -0.53289794921875, "reward_std": 0.835723340511322, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.7532958984375, "rewards/ppl_reward/std": 3.6798105239868164, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 135.296875, "completions/mean_terminated_length": 135.296875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.4471519951264087, "grad_norm": 1.5962252616882324, "kl": 3.48046875, "learning_rate": 5.327851381818183e-06, "loss": 0.0163, "num_tokens": 50482578.0, "reward": -1.8927001953125, "reward_std": 1.2431292533874512, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.402587890625, "rewards/ppl_reward/std": 5.424143314361572, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.23249077796936035, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 138.578125, "completions/mean_terminated_length": 138.578125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.4483703929332927, "grad_norm": 2.192378044128418, "kl": 4.2548828125, "learning_rate": 5.320325651388716e-06, "loss": 0.1336, "num_tokens": 50498007.0, "reward": -0.35552978515625, "reward_std": 0.8436753749847412, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.4063720703125, "rewards/ppl_reward/std": 2.775665044784546, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.4495887907401768, "grad_norm": 1.8356338739395142, "kl": 6.09375, "learning_rate": 5.312803313012167e-06, "loss": 0.2191, "num_tokens": 50514679.0, "reward": -1.5816650390625, "reward_std": 0.9826582670211792, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.725830078125, "rewards/ppl_reward/std": 5.42454719543457, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 142.953125, "completions/mean_terminated_length": 142.953125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.450807188547061, "grad_norm": 2.3893215656280518, "kl": 3.6015625, "learning_rate": 5.305284372141095e-06, "loss": 0.11, "num_tokens": 50530220.0, "reward": -0.51702880859375, "reward_std": 0.5028001070022583, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.8387451171875, "rewards/ppl_reward/std": 2.5782265663146973, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 135.59375, "completions/mean_terminated_length": 135.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.4520255863539444, "grad_norm": 3.192803144454956, "kl": 4.28515625, "learning_rate": 5.297768834225581e-06, "loss": 0.0936, "num_tokens": 50545842.0, "reward": -0.8367919921875, "reward_std": 0.6787523031234741, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.392333984375, "rewards/ppl_reward/std": 1.9734059572219849, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 143.796875, "completions/mean_terminated_length": 143.796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.4532439841608285, "grad_norm": 1.910582423210144, "kl": 2.296875, "learning_rate": 5.2902567047132505e-06, "loss": 0.0983, "num_tokens": 50562125.0, "reward": -2.1192626953125, "reward_std": 0.8219627141952515, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -8.066650390625, "rewards/ppl_reward/std": 10.618607521057129, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 139.046875, "completions/mean_terminated_length": 139.046875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 3.4544623819677125, "grad_norm": 1.5342482328414917, "kl": 1.8701171875, "learning_rate": 5.282747989049258e-06, "loss": 0.0646, "num_tokens": 50577248.0, "reward": -1.5238037109375, "reward_std": 0.3012295365333557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": -7.047607421875, "rewards/ppl_reward/std": 4.174289703369141, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 129.46875, "completions/mean_terminated_length": 129.46875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 3.4556807797745965, "grad_norm": 2.134606122970581, "kl": 3.3125, "learning_rate": 5.2752426926762815e-06, "loss": 0.0616, "num_tokens": 50592110.0, "reward": -2.6590576171875, "reward_std": 1.9650003910064697, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.927490234375, "rewards/ppl_reward/std": 9.071205139160156, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 138.28125, "completions/mean_terminated_length": 138.28125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.45689917758148, "grad_norm": 1.5576900243759155, "kl": 5.83203125, "learning_rate": 5.2677408210345235e-06, "loss": 0.1411, "num_tokens": 50608384.0, "reward": -0.917724609375, "reward_std": 1.3714381456375122, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.28076171875, "rewards/ppl_reward/std": 2.6217103004455566, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24947242438793182, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 138.03125, "completions/mean_terminated_length": 138.03125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.458117575388364, "grad_norm": 1.3500314950942993, "kl": 3.017578125, "learning_rate": 5.260242379561693e-06, "loss": 0.0484, "num_tokens": 50624202.0, "reward": -0.77880859375, "reward_std": 0.5331250429153442, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.2998046875, "rewards/ppl_reward/std": 3.1507322788238525, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 148.671875, "completions/mean_terminated_length": 148.671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.459335973195248, "grad_norm": 1.82974112033844, "kl": 3.1796875, "learning_rate": 5.252747373693031e-06, "loss": 0.1619, "num_tokens": 50640885.0, "reward": -2.881103515625, "reward_std": 3.1181085109710693, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.52783203125, "rewards/ppl_reward/std": 18.662687301635742, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 130.53125, "completions/mean_terminated_length": 130.53125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.4605543710021323, "grad_norm": 1.7157008647918701, "kl": 5.896484375, "learning_rate": 5.245255808861268e-06, "loss": 0.1538, "num_tokens": 50656471.0, "reward": -2.48004150390625, "reward_std": 1.3580896854400635, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -8.4678955078125, "rewards/ppl_reward/std": 9.424605369567871, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2121305763721466, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 153.78125, "completions/mean_terminated_length": 139.96826171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.4617727688090163, "grad_norm": 1.5712764263153076, "kl": 2.59765625, "learning_rate": 5.237767690496656e-06, "loss": 0.1175, "num_tokens": 50673081.0, "reward": -1.16455078125, "reward_std": 0.5828464031219482, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.0166015625, "rewards/ppl_reward/std": 2.4908950328826904, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 131.140625, "completions/mean_terminated_length": 131.140625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.4629911666159003, "grad_norm": 1.7907956838607788, "kl": 5.03515625, "learning_rate": 5.230283024026941e-06, "loss": 0.0621, "num_tokens": 50687978.0, "reward": -3.95562744140625, "reward_std": 5.058187007904053, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -11.3018798828125, "rewards/ppl_reward/std": 25.175033569335938, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.250866562128067, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.464209564422784, "grad_norm": 1.4058064222335815, "kl": 2.73828125, "learning_rate": 5.22280181487737e-06, "loss": 0.1143, "num_tokens": 50704392.0, "reward": -0.3748779296875, "reward_std": 0.5362101197242737, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.515380859375, "rewards/ppl_reward/std": 1.5927612781524658, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 133.203125, "completions/mean_terminated_length": 133.203125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.465427962229668, "grad_norm": 1.6579325199127197, "kl": 5.78125, "learning_rate": 5.215324068470687e-06, "loss": 0.1426, "num_tokens": 50719341.0, "reward": -2.1243896484375, "reward_std": 1.3875031471252441, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.553466796875, "rewards/ppl_reward/std": 6.851865768432617, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2516993284225464, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 131.9375, "completions/mean_terminated_length": 131.9375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.466646360036552, "grad_norm": 1.7430086135864258, "kl": 4.263671875, "learning_rate": 5.207849790227115e-06, "loss": 0.0897, "num_tokens": 50734593.0, "reward": -7.3099365234375, "reward_std": 1.922992467880249, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -18.283935546875, "rewards/ppl_reward/std": 18.250059127807617, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 144.671875, "completions/mean_terminated_length": 144.671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.467864757843436, "grad_norm": 1.7978266477584839, "kl": 4.525390625, "learning_rate": 5.200378985564376e-06, "loss": 0.1628, "num_tokens": 50750732.0, "reward": -0.5672607421875, "reward_std": 0.6290832757949829, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.884521484375, "rewards/ppl_reward/std": 1.8889273405075073, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 134.0625, "completions/mean_terminated_length": 134.0625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.4690831556503197, "grad_norm": 1.821824550628662, "kl": 3.494140625, "learning_rate": 5.192911659897663e-06, "loss": 0.1077, "num_tokens": 50766432.0, "reward": -1.333984375, "reward_std": 1.0181524753570557, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.40234375, "rewards/ppl_reward/std": 5.387070178985596, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 139.015625, "completions/mean_terminated_length": 139.015625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.4703015534572037, "grad_norm": 1.326729655265808, "kl": 4.083984375, "learning_rate": 5.185447818639655e-06, "loss": 0.1012, "num_tokens": 50782377.0, "reward": -4.392333984375, "reward_std": 2.6175405979156494, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -12.36279296875, "rewards/ppl_reward/std": 22.30782127380371, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2605654299259186, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 135.03125, "completions/mean_terminated_length": 135.03125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.4715199512640877, "grad_norm": 1.7049791812896729, "kl": 4.2578125, "learning_rate": 5.1779874672005035e-06, "loss": 0.125, "num_tokens": 50798011.0, "reward": -0.8624267578125, "reward_std": 0.49090343713760376, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.490478515625, "rewards/ppl_reward/std": 2.595026731491089, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 136.65625, "completions/mean_terminated_length": 136.65625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.472738349070972, "grad_norm": 2.4092283248901367, "kl": 5.20703125, "learning_rate": 5.17053061098783e-06, "loss": 0.1532, "num_tokens": 50813981.0, "reward": -2.8818359375, "reward_std": 1.9340343475341797, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.326171875, "rewards/ppl_reward/std": 8.772817611694336, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.473956746877856, "grad_norm": 1.900071144104004, "kl": 3.4921875, "learning_rate": 5.163077255406724e-06, "loss": 0.0818, "num_tokens": 50829117.0, "reward": -0.68646240234375, "reward_std": 1.3519539833068848, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.1151123046875, "rewards/ppl_reward/std": 6.3174028396606445, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 138.84375, "completions/mean_terminated_length": 138.84375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.4751751446847394, "grad_norm": 1.545600175857544, "kl": 3.4609375, "learning_rate": 5.155627405859731e-06, "loss": 0.0594, "num_tokens": 50845291.0, "reward": -0.5897216796875, "reward_std": 1.2147839069366455, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.835693359375, "rewards/ppl_reward/std": 4.450942516326904, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20412415266036987, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 127.265625, "completions/mean_terminated_length": 127.265625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.4763935424916235, "grad_norm": 2.5073044300079346, "kl": 4.794921875, "learning_rate": 5.148181067746862e-06, "loss": 0.1826, "num_tokens": 50859684.0, "reward": -2.478515625, "reward_std": 0.7718355655670166, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -8.56640625, "rewards/ppl_reward/std": 12.35351276397705, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.4776119402985075, "grad_norm": 1.4880130290985107, "kl": 3.232421875, "learning_rate": 5.1407382464655875e-06, "loss": 0.0512, "num_tokens": 50875948.0, "reward": -8.67041015625, "reward_std": 2.6464734077453613, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -21.0830078125, "rewards/ppl_reward/std": 25.301576614379883, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 143.390625, "completions/mean_terminated_length": 143.390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.4788303381053916, "grad_norm": 1.6137619018554688, "kl": 4.73828125, "learning_rate": 5.133298947410812e-06, "loss": 0.0799, "num_tokens": 50892157.0, "reward": -2.9244384765625, "reward_std": 1.2750418186187744, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.395751953125, "rewards/ppl_reward/std": 11.586177825927734, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 148.4375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.480048735912275, "grad_norm": 2.042492151260376, "kl": 6.40625, "learning_rate": 5.1258631759749e-06, "loss": 0.209, "num_tokens": 50908409.0, "reward": -1.6409912109375, "reward_std": 1.0333341360092163, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -6.742919921875, "rewards/ppl_reward/std": 3.9878997802734375, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 136.578125, "completions/mean_terminated_length": 136.578125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.481267133719159, "grad_norm": 1.5284608602523804, "kl": 2.4501953125, "learning_rate": 5.118430937547658e-06, "loss": 0.0709, "num_tokens": 50923950.0, "reward": 0.28912353515625, "reward_std": 0.25551971793174744, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -3.3123779296875, "rewards/ppl_reward/std": 1.4565935134887695, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 135.859375, "completions/mean_terminated_length": 135.859375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.4824855315260432, "grad_norm": 1.7803891897201538, "kl": 4.435546875, "learning_rate": 5.111002237516334e-06, "loss": 0.1211, "num_tokens": 50939597.0, "reward": -3.265869140625, "reward_std": 2.362272024154663, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.14892578125, "rewards/ppl_reward/std": 10.255789756774902, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 148.609375, "completions/mean_terminated_length": 148.609375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.4837039293329273, "grad_norm": 1.9110571146011353, "kl": 2.974609375, "learning_rate": 5.103577081265596e-06, "loss": 0.1265, "num_tokens": 50956684.0, "reward": -2.03106689453125, "reward_std": 0.41099634766578674, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.9605712890625, "rewards/ppl_reward/std": 7.749204158782959, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 131.703125, "completions/mean_terminated_length": 131.703125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.4849223271398113, "grad_norm": 1.9564671516418457, "kl": 4.39453125, "learning_rate": 5.096155474177562e-06, "loss": 0.2118, "num_tokens": 50971617.0, "reward": -1.83935546875, "reward_std": 0.5536797046661377, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.3115234375, "rewards/ppl_reward/std": 4.826457977294922, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1224314495921135, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 143.984375, "completions/mean_terminated_length": 143.984375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.486140724946695, "grad_norm": 1.9466893672943115, "kl": 4.6484375, "learning_rate": 5.088737421631767e-06, "loss": 0.2382, "num_tokens": 50987936.0, "reward": -1.507568359375, "reward_std": 1.915941596031189, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.67138671875, "rewards/ppl_reward/std": 8.055763244628906, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.487359122753579, "grad_norm": 1.6446024179458618, "kl": 3.796875, "learning_rate": 5.081322929005173e-06, "loss": 0.1152, "num_tokens": 51002124.0, "reward": -2.1103515625, "reward_std": 0.6630904674530029, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.970703125, "rewards/ppl_reward/std": 5.954057693481445, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 128.109375, "completions/mean_terminated_length": 128.109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.488577520560463, "grad_norm": 2.41497540473938, "kl": 2.486328125, "learning_rate": 5.073912001672165e-06, "loss": -0.0307, "num_tokens": 51016747.0, "reward": -1.954345703125, "reward_std": 1.171633243560791, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -7.72900390625, "rewards/ppl_reward/std": 8.578930854797363, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.489795918367347, "grad_norm": 1.9392319917678833, "kl": 4.4462890625, "learning_rate": 5.0665046450045345e-06, "loss": 0.0931, "num_tokens": 51033211.0, "reward": -0.22802734375, "reward_std": 0.5015839338302612, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.0654296875, "rewards/ppl_reward/std": 3.4304254055023193, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 139.171875, "completions/mean_terminated_length": 139.171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.491014316174231, "grad_norm": 2.424678325653076, "kl": 4.4365234375, "learning_rate": 5.059100864371486e-06, "loss": 0.1157, "num_tokens": 51048710.0, "reward": -1.86260986328125, "reward_std": 0.9171609282493591, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.3892822265625, "rewards/ppl_reward/std": 4.226316928863525, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 159.046875, "completions/mean_terminated_length": 159.046875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 3.4922327139811147, "grad_norm": 1.9622929096221924, "kl": 6.263671875, "learning_rate": 5.05170066513964e-06, "loss": 0.3005, "num_tokens": 51066665.0, "reward": -1.193359375, "reward_std": 0.5300992131233215, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.07421875, "rewards/ppl_reward/std": 4.237447738647461, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 145.015625, "completions/mean_terminated_length": 145.015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.4934511117879987, "grad_norm": 1.8142292499542236, "kl": 4.375, "learning_rate": 5.044304052673015e-06, "loss": 0.1674, "num_tokens": 51083042.0, "reward": -1.0076904296875, "reward_std": 0.5475824475288391, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.601318359375, "rewards/ppl_reward/std": 1.879316806793213, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1489359587430954, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 146.921875, "completions/mean_terminated_length": 146.921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.4946695095948828, "grad_norm": 1.882758378982544, "kl": 5.00390625, "learning_rate": 5.036911032333034e-06, "loss": 0.2321, "num_tokens": 51099245.0, "reward": -0.609375, "reward_std": 0.9939824342727661, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.8046875, "rewards/ppl_reward/std": 3.1519463062286377, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 140.1875, "completions/mean_terminated_length": 140.1875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.495887907401767, "grad_norm": 1.4884321689605713, "kl": 4.216796875, "learning_rate": 5.029521609478512e-06, "loss": 0.1439, "num_tokens": 51115161.0, "reward": -0.715087890625, "reward_std": 0.46670621633529663, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.16455078125, "rewards/ppl_reward/std": 2.0117037296295166, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 142.796875, "completions/mean_terminated_length": 142.796875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 3.497106305208651, "grad_norm": 1.62375009059906, "kl": 3.0703125, "learning_rate": 5.0221357894656605e-06, "loss": 0.0443, "num_tokens": 51131020.0, "reward": -1.6434326171875, "reward_std": 0.6618773937225342, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.013427734375, "rewards/ppl_reward/std": 3.9121193885803223, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0903826504945755, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 134.40625, "completions/mean_terminated_length": 134.40625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.4983247030155344, "grad_norm": 1.6101492643356323, "kl": 4.46484375, "learning_rate": 5.014753577648073e-06, "loss": 0.0951, "num_tokens": 51146118.0, "reward": -1.0064697265625, "reward_std": 0.6808719635009766, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.528564453125, "rewards/ppl_reward/std": 2.1111233234405518, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 142.484375, "completions/mean_terminated_length": 142.484375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.4995431008224185, "grad_norm": 1.917365550994873, "kl": 3.5283203125, "learning_rate": 5.007374979376734e-06, "loss": 0.1241, "num_tokens": 51161965.0, "reward": -0.5362548828125, "reward_std": 0.4353013038635254, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.830322265625, "rewards/ppl_reward/std": 1.638285756111145, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 154.390625, "completions/mean_terminated_length": 154.390625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 3.5007614986293025, "grad_norm": 1.7632806301116943, "kl": 5.84765625, "learning_rate": 5.000000000000003e-06, "loss": 0.2072, "num_tokens": 51179070.0, "reward": -0.7069091796875, "reward_std": 0.5854167938232422, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.898193359375, "rewards/ppl_reward/std": 2.5853288173675537, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 133.828125, "completions/mean_terminated_length": 133.828125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.5019798964361866, "grad_norm": 3.228792905807495, "kl": 6.0546875, "learning_rate": 4.992628644863621e-06, "loss": 0.1528, "num_tokens": 51194107.0, "reward": -1.432373046875, "reward_std": 1.0255831480026245, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.52880859375, "rewards/ppl_reward/std": 6.284112453460693, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14471296966075897, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 3.50319829424307, "grad_norm": 1.5842958688735962, "kl": 1.46484375, "learning_rate": 4.985260919310701e-06, "loss": 0.1013, "num_tokens": 51211135.0, "reward": -0.068603515625, "reward_std": 0.1721685826778412, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.08251953125, "rewards/ppl_reward/std": 1.1791539192199707, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 157.84375, "completions/mean_terminated_length": 157.84375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.504416692049954, "grad_norm": 1.9759019613265991, "kl": 4.88671875, "learning_rate": 4.9778968286817245e-06, "loss": 0.1409, "num_tokens": 51228557.0, "reward": -0.5863037109375, "reward_std": 0.5605988502502441, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -4.805419921875, "rewards/ppl_reward/std": 2.33225679397583, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 164.578125, "completions/mean_terminated_length": 164.578125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 3.5056350898568382, "grad_norm": 2.572657585144043, "kl": 5.171875, "learning_rate": 4.970536378314543e-06, "loss": 0.3164, "num_tokens": 51246978.0, "reward": -0.7569580078125, "reward_std": 0.4325578212738037, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.318603515625, "rewards/ppl_reward/std": 1.6209112405776978, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 132.578125, "completions/mean_terminated_length": 132.578125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.5068534876637223, "grad_norm": 1.539461374282837, "kl": 2.32421875, "learning_rate": 4.963179573544356e-06, "loss": 0.0358, "num_tokens": 51262511.0, "reward": -1.00732421875, "reward_std": 0.38164469599723816, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.8037109375, "rewards/ppl_reward/std": 5.267867565155029, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.5080718854706063, "grad_norm": 2.4023146629333496, "kl": 3.994140625, "learning_rate": 4.955826419703736e-06, "loss": 0.1021, "num_tokens": 51279023.0, "reward": -4.1463623046875, "reward_std": 3.3862130641937256, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -11.847412109375, "rewards/ppl_reward/std": 18.685094833374023, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2237938940525055, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 151.21875, "completions/mean_terminated_length": 151.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.5092902832774904, "grad_norm": 3.019951581954956, "kl": 6.53125, "learning_rate": 4.948476922122597e-06, "loss": 0.2485, "num_tokens": 51295437.0, "reward": -1.877197265625, "reward_std": 0.7822358012199402, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.30908203125, "rewards/ppl_reward/std": 7.742797374725342, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2052978277206421, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 156.40625, "completions/mean_terminated_length": 156.40625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.510508681084374, "grad_norm": 1.77857506275177, "kl": 4.11328125, "learning_rate": 4.9411310861282166e-06, "loss": 0.153, "num_tokens": 51312799.0, "reward": -1.8875732421875, "reward_std": 1.6547757387161255, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.439208984375, "rewards/ppl_reward/std": 8.745503425598145, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 3.511727078891258, "grad_norm": 2.484611749649048, "kl": 7.46875, "learning_rate": 4.933788917045201e-06, "loss": 0.4611, "num_tokens": 51330391.0, "reward": -1.611328125, "reward_std": 0.757888674736023, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.81640625, "rewards/ppl_reward/std": 2.658322334289551, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 166.109375, "completions/mean_terminated_length": 166.109375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.512945476698142, "grad_norm": 4.013733386993408, "kl": 5.09765625, "learning_rate": 4.926450420195513e-06, "loss": 0.4158, "num_tokens": 51348086.0, "reward": -0.6357421875, "reward_std": 0.4138127565383911, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.068359375, "rewards/ppl_reward/std": 2.4938952922821045, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 165.84375, "completions/mean_terminated_length": 165.84375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.5141638745050257, "grad_norm": 1.779194712638855, "kl": 6.91015625, "learning_rate": 4.9191156008984435e-06, "loss": 0.3977, "num_tokens": 51366124.0, "reward": -0.3526611328125, "reward_std": 0.7643395066261292, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.189697265625, "rewards/ppl_reward/std": 2.2824225425720215, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 145.546875, "completions/mean_terminated_length": 145.546875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.5153822723119097, "grad_norm": 2.9922866821289062, "kl": 5.041015625, "learning_rate": 4.911784464470624e-06, "loss": 0.1623, "num_tokens": 51381951.0, "reward": -2.57861328125, "reward_std": 1.6819130182266235, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.6728515625, "rewards/ppl_reward/std": 10.062983512878418, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 145.015625, "completions/mean_terminated_length": 145.015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.5166006701187937, "grad_norm": 1.8454909324645996, "kl": 4.107421875, "learning_rate": 4.904457016226014e-06, "loss": 0.1848, "num_tokens": 51398896.0, "reward": -1.71624755859375, "reward_std": 0.5551002621650696, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.1824951171875, "rewards/ppl_reward/std": 4.979846477508545, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 136.1875, "completions/mean_terminated_length": 136.1875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.517819067925678, "grad_norm": 1.8513263463974, "kl": 3.740234375, "learning_rate": 4.897133261475901e-06, "loss": 0.1016, "num_tokens": 51414172.0, "reward": -2.1046142578125, "reward_std": 2.2391257286071777, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.873291015625, "rewards/ppl_reward/std": 9.733656883239746, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1979166716337204, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 151.65625, "completions/mean_terminated_length": 151.65625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 3.519037465732562, "grad_norm": 1.8734426498413086, "kl": 4.1630859375, "learning_rate": 4.889813205528895e-06, "loss": 0.1651, "num_tokens": 51430846.0, "reward": -1.1571044921875, "reward_std": 0.766052782535553, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.978271484375, "rewards/ppl_reward/std": 3.4584193229675293, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16399458050727844, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 150.359375, "completions/mean_terminated_length": 150.359375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.520255863539446, "grad_norm": 1.2891098260879517, "kl": 3.7333984375, "learning_rate": 4.882496853690927e-06, "loss": 0.1235, "num_tokens": 51448085.0, "reward": -0.3681640625, "reward_std": 0.5508573651313782, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.462890625, "rewards/ppl_reward/std": 2.4119725227355957, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 146.8125, "completions/mean_terminated_length": 146.8125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.5214742613463295, "grad_norm": 1.6909990310668945, "kl": 5.13671875, "learning_rate": 4.875184211265238e-06, "loss": 0.1469, "num_tokens": 51464057.0, "reward": -1.0855712890625, "reward_std": 0.9136976003646851, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.616455078125, "rewards/ppl_reward/std": 2.5341646671295166, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23302355408668518, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 145.953125, "completions/mean_terminated_length": 145.953125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 3.5226926591532135, "grad_norm": 2.630113363265991, "kl": 4.3359375, "learning_rate": 4.867875283552378e-06, "loss": 0.1494, "num_tokens": 51480582.0, "reward": -1.27734375, "reward_std": 1.243837594985962, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.1875, "rewards/ppl_reward/std": 3.7534914016723633, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 139.28125, "completions/mean_terminated_length": 139.28125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.5239110569600975, "grad_norm": 2.1946682929992676, "kl": 3.6806640625, "learning_rate": 4.860570075850214e-06, "loss": 0.0772, "num_tokens": 51496208.0, "reward": -0.8587646484375, "reward_std": 1.0291799306869507, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.428466796875, "rewards/ppl_reward/std": 4.871912479400635, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 140.1875, "completions/mean_terminated_length": 140.1875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.5251294547669816, "grad_norm": 2.3200185298919678, "kl": 6.3515625, "learning_rate": 4.85326859345391e-06, "loss": 0.3313, "num_tokens": 51512164.0, "reward": -0.2711181640625, "reward_std": 0.8613204956054688, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.065673828125, "rewards/ppl_reward/std": 1.767395257949829, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1677328199148178, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 156.265625, "completions/mean_terminated_length": 156.265625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.526347852573865, "grad_norm": 2.8956799507141113, "kl": 6.7734375, "learning_rate": 4.845970841655933e-06, "loss": 0.2978, "num_tokens": 51528781.0, "reward": -3.477294921875, "reward_std": 3.9934349060058594, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -10.37646484375, "rewards/ppl_reward/std": 22.91991424560547, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19527530670166016, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 142.734375, "completions/mean_terminated_length": 142.734375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.5275662503807492, "grad_norm": 1.8043580055236816, "kl": 3.494140625, "learning_rate": 4.838676825746049e-06, "loss": 0.1105, "num_tokens": 51545236.0, "reward": -0.118408203125, "reward_std": 0.46289610862731934, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -3.86962890625, "rewards/ppl_reward/std": 1.9273512363433838, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 147.296875, "completions/mean_terminated_length": 147.296875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.5287846481876333, "grad_norm": 1.6382769346237183, "kl": 6.1171875, "learning_rate": 4.831386551011301e-06, "loss": 0.2221, "num_tokens": 51561399.0, "reward": -1.7392578125, "reward_std": 0.8005207777023315, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -6.892578125, "rewards/ppl_reward/std": 3.2957582473754883, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1706644892692566, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.5300030459945173, "grad_norm": 2.4800004959106445, "kl": 5.08984375, "learning_rate": 4.824100022736036e-06, "loss": 0.1685, "num_tokens": 51577599.0, "reward": -0.86279296875, "reward_std": 0.49573057889938354, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.4677734375, "rewards/ppl_reward/std": 2.36007022857666, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 3.5312214438014013, "grad_norm": 1.8491826057434082, "kl": 3.41015625, "learning_rate": 4.81681724620188e-06, "loss": 0.1498, "num_tokens": 51593319.0, "reward": -4.1708984375, "reward_std": 0.7530395984649658, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -12.115234375, "rewards/ppl_reward/std": 12.97789192199707, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 139.96875, "completions/mean_terminated_length": 139.96875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.532439841608285, "grad_norm": 1.3985860347747803, "kl": 3.54296875, "learning_rate": 4.8095382266877414e-06, "loss": 0.0416, "num_tokens": 51608605.0, "reward": -1.12109375, "reward_std": 1.6190580129623413, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.8828125, "rewards/ppl_reward/std": 4.740097999572754, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1751912236213684, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 130.234375, "completions/mean_terminated_length": 130.234375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.533658239415169, "grad_norm": 2.651355028152466, "kl": 7.2109375, "learning_rate": 4.802262969469799e-06, "loss": 0.179, "num_tokens": 51623764.0, "reward": -2.0408935546875, "reward_std": 1.0614502429962158, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.363037109375, "rewards/ppl_reward/std": 3.045328378677368, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2519763112068176, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.534876637222053, "grad_norm": 2.1492884159088135, "kl": 6.1767578125, "learning_rate": 4.79499147982151e-06, "loss": 0.214, "num_tokens": 51639716.0, "reward": -1.9288330078125, "reward_std": 0.6487541198730469, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.451416015625, "rewards/ppl_reward/std": 4.80668830871582, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 141.84375, "completions/mean_terminated_length": 141.84375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.536095035028937, "grad_norm": 2.0871317386627197, "kl": 4.59375, "learning_rate": 4.787723763013606e-06, "loss": 0.1382, "num_tokens": 51655514.0, "reward": -3.43707275390625, "reward_std": 0.8552778363227844, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.5382080078125, "rewards/ppl_reward/std": 16.115821838378906, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 137.015625, "completions/mean_terminated_length": 137.015625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.5373134328358207, "grad_norm": 2.3279542922973633, "kl": 7.01953125, "learning_rate": 4.7804598243140664e-06, "loss": 0.2448, "num_tokens": 51671379.0, "reward": -1.1083984375, "reward_std": 1.3305084705352783, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "rewards/ppl_reward/mean": -5.521484375, "rewards/ppl_reward/std": 4.266885757446289, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1994769275188446, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 148.515625, "completions/mean_terminated_length": 148.515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 3.5385318306427047, "grad_norm": 2.0152738094329834, "kl": 3.9970703125, "learning_rate": 4.773199668988151e-06, "loss": 0.2031, "num_tokens": 51688652.0, "reward": -0.32733154296875, "reward_std": 0.4904007911682129, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.3421630859375, "rewards/ppl_reward/std": 1.7839508056640625, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 135.28125, "completions/mean_terminated_length": 135.28125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.5397502284495888, "grad_norm": 1.8915199041366577, "kl": 4.12109375, "learning_rate": 4.765943302298367e-06, "loss": 0.2313, "num_tokens": 51704174.0, "reward": -0.7674560546875, "reward_std": 0.5319264531135559, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.144287109375, "rewards/ppl_reward/std": 3.38862943649292, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 149.34375, "completions/mean_terminated_length": 149.34375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 3.540968626256473, "grad_norm": 2.8648059368133545, "kl": 3.740234375, "learning_rate": 4.758690729504478e-06, "loss": 0.0892, "num_tokens": 51720996.0, "reward": -0.7164306640625, "reward_std": 0.5596590638160706, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.135986328125, "rewards/ppl_reward/std": 3.3229966163635254, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13524486124515533, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 148.90625, "completions/mean_terminated_length": 148.90625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.542187024063357, "grad_norm": 2.5907368659973145, "kl": 3.2509765625, "learning_rate": 4.751441955863501e-06, "loss": 0.1564, "num_tokens": 51738030.0, "reward": -0.64208984375, "reward_std": 0.4787597060203552, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.1201171875, "rewards/ppl_reward/std": 2.4121949672698975, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.543405421870241, "grad_norm": 1.7194108963012695, "kl": 6.15234375, "learning_rate": 4.744196986629689e-06, "loss": 0.1877, "num_tokens": 51753618.0, "reward": -1.740234375, "reward_std": 0.73586106300354, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.96484375, "rewards/ppl_reward/std": 2.9962387084960938, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 150.609375, "completions/mean_terminated_length": 136.74603271484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.5446238196771245, "grad_norm": 1.4951844215393066, "kl": 3.119140625, "learning_rate": 4.73695582705455e-06, "loss": 0.2734, "num_tokens": 51770225.0, "reward": -0.4224853515625, "reward_std": 0.4192401170730591, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.563720703125, "rewards/ppl_reward/std": 1.1604729890823364, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 141.953125, "completions/mean_terminated_length": 141.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.5458422174840085, "grad_norm": 1.900539755821228, "kl": 4.2412109375, "learning_rate": 4.729718482386819e-06, "loss": 0.2218, "num_tokens": 51786246.0, "reward": -1.8455810546875, "reward_std": 0.5613067150115967, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.339599609375, "rewards/ppl_reward/std": 3.7913012504577637, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11672777682542801, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 137.21875, "completions/mean_terminated_length": 137.21875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.5470606152908926, "grad_norm": 4.121294975280762, "kl": 4.3125, "learning_rate": 4.722484957872474e-06, "loss": 0.1134, "num_tokens": 51801132.0, "reward": -0.8759765625, "reward_std": 1.0309447050094604, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.509765625, "rewards/ppl_reward/std": 3.8410327434539795, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17567719519138336, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 152.015625, "completions/mean_terminated_length": 152.015625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.548279013097776, "grad_norm": 2.3224756717681885, "kl": 5.7822265625, "learning_rate": 4.7152552587547204e-06, "loss": 0.2121, "num_tokens": 51818701.0, "reward": -0.9825439453125, "reward_std": 0.6625548601150513, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.496337890625, "rewards/ppl_reward/std": 1.6404438018798828, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14773420989513397, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 127.21875, "completions/mean_terminated_length": 127.21875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.54949741090466, "grad_norm": 2.1257362365722656, "kl": 4.5703125, "learning_rate": 4.708029390273994e-06, "loss": 0.1236, "num_tokens": 51833971.0, "reward": -1.2164306640625, "reward_std": 0.9394538998603821, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.057861328125, "rewards/ppl_reward/std": 2.745483636856079, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 143.828125, "completions/mean_terminated_length": 143.828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.5507158087115442, "grad_norm": 2.3319876194000244, "kl": 6.9609375, "learning_rate": 4.700807357667953e-06, "loss": 0.247, "num_tokens": 51850920.0, "reward": -1.98724365234375, "reward_std": 0.8150804042816162, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -7.3885498046875, "rewards/ppl_reward/std": 5.059706211090088, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.21532177925109863, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 145.578125, "completions/mean_terminated_length": 145.578125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 3.5519342065184283, "grad_norm": 1.9538087844848633, "kl": 5.1171875, "learning_rate": 4.693589166171466e-06, "loss": 0.2229, "num_tokens": 51867077.0, "reward": -1.0469970703125, "reward_std": 0.8627504110336304, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.679931640625, "rewards/ppl_reward/std": 2.588573932647705, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2004072666168213, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 138.15625, "completions/mean_terminated_length": 138.15625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.5531526043253123, "grad_norm": 1.5114179849624634, "kl": 6.3671875, "learning_rate": 4.686374821016636e-06, "loss": 0.2594, "num_tokens": 51883799.0, "reward": -0.3961181640625, "reward_std": 0.6473805904388428, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -4.354736328125, "rewards/ppl_reward/std": 1.2455236911773682, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1326993852853775, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 132.8125, "completions/mean_terminated_length": 132.8125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.5543710021321964, "grad_norm": 2.0195019245147705, "kl": 5.859375, "learning_rate": 4.679164327432756e-06, "loss": 0.2791, "num_tokens": 51899379.0, "reward": -1.580322265625, "reward_std": 0.8767064809799194, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.75439453125, "rewards/ppl_reward/std": 2.1090002059936523, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.139975905418396, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 134.890625, "completions/mean_terminated_length": 134.890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.55558939993908, "grad_norm": 1.870126724243164, "kl": 6.955078125, "learning_rate": 4.6719576906463445e-06, "loss": 0.2717, "num_tokens": 51914852.0, "reward": -0.751220703125, "reward_std": 0.6462356448173523, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.04150390625, "rewards/ppl_reward/std": 1.8502461910247803, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18729320168495178, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 130.28125, "completions/mean_terminated_length": 130.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.556807797745964, "grad_norm": 2.8940634727478027, "kl": 6.76171875, "learning_rate": 4.664754915881118e-06, "loss": 0.2169, "num_tokens": 51930302.0, "reward": -1.0838623046875, "reward_std": 1.3475240468978882, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.675537109375, "rewards/ppl_reward/std": 4.981364727020264, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.24672332406044006, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 140.390625, "completions/mean_terminated_length": 140.390625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.558026195552848, "grad_norm": 1.9225190877914429, "kl": 6.06640625, "learning_rate": 4.6575560083579915e-06, "loss": 0.2246, "num_tokens": 51946103.0, "reward": -2.19287109375, "reward_std": 1.8983783721923828, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -7.7998046875, "rewards/ppl_reward/std": 10.62131118774414, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16512493789196014, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 142.640625, "completions/mean_terminated_length": 142.640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.559244593359732, "grad_norm": 2.0146822929382324, "kl": 4.43359375, "learning_rate": 4.650360973295086e-06, "loss": 0.1044, "num_tokens": 51962528.0, "reward": -1.370849609375, "reward_std": 0.6027098894119263, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.36669921875, "rewards/ppl_reward/std": 3.9113736152648926, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.5604629911666157, "grad_norm": 2.4030919075012207, "kl": 5.7578125, "learning_rate": 4.6431698159077e-06, "loss": 0.23, "num_tokens": 51980673.0, "reward": -0.80712890625, "reward_std": 0.9395931363105774, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.0595703125, "rewards/ppl_reward/std": 1.9913158416748047, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2627868354320526, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 134.34375, "completions/mean_terminated_length": 134.34375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.5616813889734997, "grad_norm": 1.7179232835769653, "kl": 2.490234375, "learning_rate": 4.635982541408334e-06, "loss": 0.0596, "num_tokens": 51996615.0, "reward": -1.656005859375, "reward_std": 0.9551181793212891, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.06982421875, "rewards/ppl_reward/std": 6.808880805969238, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 138.546875, "completions/mean_terminated_length": 138.546875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.5628997867803838, "grad_norm": 1.526355504989624, "kl": 3.6474609375, "learning_rate": 4.628799155006669e-06, "loss": 0.1725, "num_tokens": 52012770.0, "reward": -0.8134765625, "reward_std": 0.8819411993026733, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.298828125, "rewards/ppl_reward/std": 3.5343363285064697, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19654127955436707, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 141.046875, "completions/mean_terminated_length": 141.046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.564118184587268, "grad_norm": 2.176156520843506, "kl": 5.544921875, "learning_rate": 4.6216196619095745e-06, "loss": 0.2801, "num_tokens": 52028733.0, "reward": -6.5611572265625, "reward_std": 3.550490140914917, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -16.755126953125, "rewards/ppl_reward/std": 19.587234497070312, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 146.46875, "completions/mean_terminated_length": 146.46875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 3.565336582394152, "grad_norm": 2.517739772796631, "kl": 3.5224609375, "learning_rate": 4.614444067321083e-06, "loss": 0.185, "num_tokens": 52045411.0, "reward": -1.331298828125, "reward_std": 0.36100804805755615, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.53759765625, "rewards/ppl_reward/std": 6.3068671226501465, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 134.85714721679688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.566554980201036, "grad_norm": 1.7874717712402344, "kl": 4.5244140625, "learning_rate": 4.607272376442415e-06, "loss": 0.2415, "num_tokens": 52061611.0, "reward": -1.30133056640625, "reward_std": 0.5564049482345581, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.2823486328125, "rewards/ppl_reward/std": 3.3337795734405518, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 153.703125, "completions/mean_terminated_length": 153.703125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 3.5677733780079195, "grad_norm": 2.2120368480682373, "kl": 4.24609375, "learning_rate": 4.6001045944719606e-06, "loss": 0.1919, "num_tokens": 52079040.0, "reward": -1.6722412109375, "reward_std": 0.7271075248718262, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -7.031982421875, "rewards/ppl_reward/std": 4.9568071365356445, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 146.078125, "completions/mean_terminated_length": 146.078125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.5689917758148035, "grad_norm": 2.1804792881011963, "kl": 2.3017578125, "learning_rate": 4.592940726605265e-06, "loss": 0.0898, "num_tokens": 52095957.0, "reward": -1.388427734375, "reward_std": 0.36079418659210205, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.66748046875, "rewards/ppl_reward/std": 5.274755477905273, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 149.078125, "completions/mean_terminated_length": 149.078125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.5702101736216876, "grad_norm": 1.6089279651641846, "kl": 1.6357421875, "learning_rate": 4.585780778035047e-06, "loss": -0.0363, "num_tokens": 52112754.0, "reward": -0.2022705078125, "reward_std": 0.29524916410446167, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -4.287353515625, "rewards/ppl_reward/std": 1.592167854309082, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 3.571428571428571, "grad_norm": 1.8644633293151855, "kl": 3.0625, "learning_rate": 4.578624753951183e-06, "loss": 0.0561, "num_tokens": 52127298.0, "reward": -1.9613037109375, "reward_std": 1.9887993335723877, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.602294921875, "rewards/ppl_reward/std": 7.2582316398620605, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 122.390625, "completions/mean_terminated_length": 122.390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.572646969235455, "grad_norm": 3.1701176166534424, "kl": 5.98046875, "learning_rate": 4.571472659540702e-06, "loss": 0.1685, "num_tokens": 52141059.0, "reward": -5.41357421875, "reward_std": 1.891932487487793, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -14.1240234375, "rewards/ppl_reward/std": 7.451165676116943, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2478829026222229, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 122.140625, "completions/mean_terminated_length": 122.140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.5738653670423393, "grad_norm": 2.3346095085144043, "kl": 3.28125, "learning_rate": 4.56432449998779e-06, "loss": 0.0634, "num_tokens": 52155468.0, "reward": -2.2156982421875, "reward_std": 0.8769192099571228, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.134521484375, "rewards/ppl_reward/std": 7.527133464813232, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 137.296875, "completions/mean_terminated_length": 137.296875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.5750837648492233, "grad_norm": 1.6360992193222046, "kl": 2.984375, "learning_rate": 4.557180280473773e-06, "loss": 0.0795, "num_tokens": 52171279.0, "reward": -1.970703125, "reward_std": 0.9722234010696411, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.69140625, "rewards/ppl_reward/std": 5.536156177520752, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.5763021626561073, "grad_norm": 1.440224528312683, "kl": 4.060546875, "learning_rate": 4.550040006177121e-06, "loss": 0.0653, "num_tokens": 52186503.0, "reward": -2.02197265625, "reward_std": 1.4301550388336182, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -7.5751953125, "rewards/ppl_reward/std": 3.498763084411621, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 135.078125, "completions/mean_terminated_length": 135.078125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.5775205604629914, "grad_norm": 1.667380690574646, "kl": 3.7978515625, "learning_rate": 4.542903682273452e-06, "loss": 0.1288, "num_tokens": 52202228.0, "reward": -0.757568359375, "reward_std": 0.5514276027679443, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.24169921875, "rewards/ppl_reward/std": 1.338233232498169, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 143.1875, "completions/mean_terminated_length": 143.1875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.578738958269875, "grad_norm": 1.4023463726043701, "kl": 3.685546875, "learning_rate": 4.535771313935515e-06, "loss": 0.1258, "num_tokens": 52218784.0, "reward": -1.479736328125, "reward_std": 1.8598973751068115, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.63134765625, "rewards/ppl_reward/std": 6.695120811462402, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.579957356076759, "grad_norm": 4.898397922515869, "kl": 3.701171875, "learning_rate": 4.528642906333196e-06, "loss": 0.0416, "num_tokens": 52234428.0, "reward": -1.0518798828125, "reward_std": 0.6715447902679443, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.853759765625, "rewards/ppl_reward/std": 2.3479433059692383, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.581175753883643, "grad_norm": 2.366344451904297, "kl": 6.4921875, "learning_rate": 4.521518464633505e-06, "loss": 0.2129, "num_tokens": 52252308.0, "reward": -1.0279541015625, "reward_std": 1.0449093580245972, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.587158203125, "rewards/ppl_reward/std": 3.306708574295044, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19416078925132751, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 128.203125, "completions/mean_terminated_length": 128.203125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.582394151690527, "grad_norm": 1.8603051900863647, "kl": 4.640625, "learning_rate": 4.514397994000587e-06, "loss": 0.0443, "num_tokens": 52267001.0, "reward": -1.9481201171875, "reward_std": 1.0224709510803223, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.669677734375, "rewards/ppl_reward/std": 7.329573631286621, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 143.265625, "completions/mean_terminated_length": 143.265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.5836125494974107, "grad_norm": 1.807100534439087, "kl": 4.42578125, "learning_rate": 4.507281499595689e-06, "loss": 0.1055, "num_tokens": 52283386.0, "reward": -1.81982421875, "reward_std": 1.5414398908615112, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.2646484375, "rewards/ppl_reward/std": 8.515013694763184, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 145.015625, "completions/mean_terminated_length": 145.015625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.5848309473042947, "grad_norm": 3.069669723510742, "kl": 5.328125, "learning_rate": 4.500168986577197e-06, "loss": 0.2245, "num_tokens": 52299691.0, "reward": -1.0743408203125, "reward_std": 0.6187816858291626, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.812744140625, "rewards/ppl_reward/std": 2.0147905349731445, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 135.921875, "completions/mean_terminated_length": 135.921875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.586049345111179, "grad_norm": 4.906494140625, "kl": 9.25, "learning_rate": 4.4930604601006025e-06, "loss": 0.3854, "num_tokens": 52314838.0, "reward": -4.0452880859375, "reward_std": 2.814523696899414, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -11.457763671875, "rewards/ppl_reward/std": 13.786090850830078, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.24241341650485992, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.587267742918063, "grad_norm": 2.1074297428131104, "kl": 5.796875, "learning_rate": 4.485955925318501e-06, "loss": 0.2625, "num_tokens": 52330850.0, "reward": -1.9979248046875, "reward_std": 1.462119698524475, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -7.589599609375, "rewards/ppl_reward/std": 5.909310340881348, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19920477271080017, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 127.421875, "completions/mean_terminated_length": 127.421875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.588486140724947, "grad_norm": 3.430086374282837, "kl": 5.4140625, "learning_rate": 4.4788553873806054e-06, "loss": 0.1391, "num_tokens": 52345541.0, "reward": -0.43701171875, "reward_std": 0.8354284763336182, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.3896484375, "rewards/ppl_reward/std": 1.9210468530654907, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 131.421875, "completions/mean_terminated_length": 131.421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.589704538531831, "grad_norm": 1.5889009237289429, "kl": 2.64453125, "learning_rate": 4.471758851433728e-06, "loss": 0.0823, "num_tokens": 52359904.0, "reward": -0.382080078125, "reward_std": 0.564508318901062, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -4.56884765625, "rewards/ppl_reward/std": 2.1537792682647705, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 155.53125, "completions/mean_terminated_length": 155.53125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 3.5909229363387145, "grad_norm": 2.0977401733398438, "kl": 5.46875, "learning_rate": 4.464666322621775e-06, "loss": 0.252, "num_tokens": 52377746.0, "reward": -1.837890625, "reward_std": 0.5264962911605835, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -7.47265625, "rewards/ppl_reward/std": 7.292431831359863, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 154.21875, "completions/mean_terminated_length": 154.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.5921413341455986, "grad_norm": 1.529627799987793, "kl": 4.80078125, "learning_rate": 4.457577806085754e-06, "loss": 0.2611, "num_tokens": 52394704.0, "reward": -1.399169921875, "reward_std": 0.605096161365509, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.60302734375, "rewards/ppl_reward/std": 4.638391494750977, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 146.203125, "completions/mean_terminated_length": 146.203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.5933597319524826, "grad_norm": 2.4068562984466553, "kl": 4.71484375, "learning_rate": 4.4504933069637635e-06, "loss": 0.1775, "num_tokens": 52410925.0, "reward": -1.3336181640625, "reward_std": 0.9891514778137207, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.339111328125, "rewards/ppl_reward/std": 3.410564661026001, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 133.78125, "completions/mean_terminated_length": 133.78125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.594578129759366, "grad_norm": 1.4459608793258667, "kl": 2.69140625, "learning_rate": 4.443412830390988e-06, "loss": 0.004, "num_tokens": 52426527.0, "reward": -0.39404296875, "reward_std": 0.7565015554428101, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -4.4130859375, "rewards/ppl_reward/std": 2.027780294418335, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19416078925132751, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 143.59375, "completions/mean_terminated_length": 143.59375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.5957965275662502, "grad_norm": 2.1462717056274414, "kl": 2.4560546875, "learning_rate": 4.436336381499701e-06, "loss": 0.1331, "num_tokens": 52442813.0, "reward": -4.322265625, "reward_std": 2.5784404277801514, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -12.58203125, "rewards/ppl_reward/std": 25.2216739654541, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 136.484375, "completions/mean_terminated_length": 136.484375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.5970149253731343, "grad_norm": 1.982757806777954, "kl": 3.38671875, "learning_rate": 4.429263965419247e-06, "loss": 0.0304, "num_tokens": 52458772.0, "reward": -2.95703125, "reward_std": 0.647922694683075, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -9.6875, "rewards/ppl_reward/std": 9.155842781066895, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 135.765625, "completions/mean_terminated_length": 135.765625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.5982333231800183, "grad_norm": 1.9675297737121582, "kl": 4.5185546875, "learning_rate": 4.422195587276058e-06, "loss": 0.1465, "num_tokens": 52473989.0, "reward": -3.3095703125, "reward_std": 0.6247745752334595, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -10.283203125, "rewards/ppl_reward/std": 11.554545402526855, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13768701255321503, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.5994517209869024, "grad_norm": 1.7520073652267456, "kl": 3.134765625, "learning_rate": 4.41513125219363e-06, "loss": 0.0405, "num_tokens": 52490341.0, "reward": -1.0499267578125, "reward_std": 0.9439064860343933, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.693603515625, "rewards/ppl_reward/std": 3.074237585067749, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22658175230026245, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.6006701187937864, "grad_norm": 1.6630439758300781, "kl": 5.15234375, "learning_rate": 4.408070965292534e-06, "loss": 0.1286, "num_tokens": 52505765.0, "reward": -1.343017578125, "reward_std": 0.9275233149528503, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.26416015625, "rewards/ppl_reward/std": 3.1254074573516846, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13706642389297485, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 135.53125, "completions/mean_terminated_length": 135.53125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.60188851660067, "grad_norm": 2.6136696338653564, "kl": 5.42578125, "learning_rate": 4.401014731690405e-06, "loss": 0.0748, "num_tokens": 52521167.0, "reward": -1.308349609375, "reward_std": 1.0860416889190674, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.10107421875, "rewards/ppl_reward/std": 5.6269330978393555, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.603106914407554, "grad_norm": 2.247448205947876, "kl": 4.091796875, "learning_rate": 4.3939625565019415e-06, "loss": 0.08, "num_tokens": 52536887.0, "reward": -1.0057373046875, "reward_std": 0.7426096200942993, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.738037109375, "rewards/ppl_reward/std": 2.790120840072632, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 142.734375, "completions/mean_terminated_length": 142.734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.604325312214438, "grad_norm": 2.0886569023132324, "kl": 4.3046875, "learning_rate": 4.3869144448389e-06, "loss": 0.1424, "num_tokens": 52553158.0, "reward": -1.2838134765625, "reward_std": 0.989128589630127, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -6.137939453125, "rewards/ppl_reward/std": 4.498444080352783, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.23854589462280273, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 150.90625, "completions/mean_terminated_length": 150.90625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.605543710021322, "grad_norm": 2.004484176635742, "kl": 6.1484375, "learning_rate": 4.379870401810092e-06, "loss": 0.2338, "num_tokens": 52570408.0, "reward": -1.7891845703125, "reward_std": 2.1898460388183594, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.093994140625, "rewards/ppl_reward/std": 9.584433555603027, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2366211861371994, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 155.796875, "completions/mean_terminated_length": 155.796875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.6067621078282057, "grad_norm": 1.8591824769973755, "kl": 3.685546875, "learning_rate": 4.372830432521377e-06, "loss": 0.0711, "num_tokens": 52587795.0, "reward": -1.9317626953125, "reward_std": 1.1699885129928589, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.426025390625, "rewards/ppl_reward/std": 6.032301902770996, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19416078925132751, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 133.421875, "completions/mean_terminated_length": 133.421875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.6079805056350898, "grad_norm": 1.296772837638855, "kl": 3.89453125, "learning_rate": 4.3657945420756575e-06, "loss": 0.0707, "num_tokens": 52602694.0, "reward": -1.6146240234375, "reward_std": 0.49537086486816406, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.994873046875, "rewards/ppl_reward/std": 7.182366371154785, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07344620674848557, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 133.859375, "completions/mean_terminated_length": 133.859375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.609198903441974, "grad_norm": 3.5510990619659424, "kl": 4.970703125, "learning_rate": 4.3587627355728904e-06, "loss": 0.0305, "num_tokens": 52617741.0, "reward": -4.36285400390625, "reward_std": 1.6380491256713867, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -12.0538330078125, "rewards/ppl_reward/std": 19.934850692749023, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2634054720401764, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 133.109375, "completions/mean_terminated_length": 133.109375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.610417301248858, "grad_norm": 1.5784920454025269, "kl": 2.365234375, "learning_rate": 4.351735018110066e-06, "loss": 0.0469, "num_tokens": 52632628.0, "reward": -1.7486572265625, "reward_std": 0.42356759309768677, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -7.380126953125, "rewards/ppl_reward/std": 7.061352252960205, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 167.96875, "completions/mean_terminated_length": 167.96875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.611635699055742, "grad_norm": 2.5050368309020996, "kl": 3.86328125, "learning_rate": 4.344711394781212e-06, "loss": 0.1045, "num_tokens": 52650290.0, "reward": -0.6522216796875, "reward_std": 0.4439605176448822, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.085693359375, "rewards/ppl_reward/std": 1.6460665464401245, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 137.53125, "completions/mean_terminated_length": 137.53125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.6128540968626255, "grad_norm": 1.8283953666687012, "kl": 5.8046875, "learning_rate": 4.337691870677393e-06, "loss": 0.1265, "num_tokens": 52665988.0, "reward": -0.83831787109375, "reward_std": 1.1197644472122192, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -5.1141357421875, "rewards/ppl_reward/std": 3.3568904399871826, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22493386268615723, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 147.84375, "completions/mean_terminated_length": 147.84375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.6140724946695095, "grad_norm": 1.9925447702407837, "kl": 4.93359375, "learning_rate": 4.330676450886688e-06, "loss": 0.1695, "num_tokens": 52682594.0, "reward": 0.2105712890625, "reward_std": 0.2655770480632782, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -3.453857421875, "rewards/ppl_reward/std": 1.1413049697875977, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.6152908924763936, "grad_norm": 2.2946176528930664, "kl": 3.70703125, "learning_rate": 4.323665140494217e-06, "loss": 0.0678, "num_tokens": 52699274.0, "reward": -0.654052734375, "reward_std": 0.5663050413131714, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.99560546875, "rewards/ppl_reward/std": 1.8197643756866455, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.6165092902832776, "grad_norm": 1.530912160873413, "kl": 4.390625, "learning_rate": 4.316657944582112e-06, "loss": 0.1295, "num_tokens": 52715770.0, "reward": -1.468017578125, "reward_std": 0.8426393270492554, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.52197265625, "rewards/ppl_reward/std": 4.166300296783447, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 135.0625, "completions/mean_terminated_length": 135.0625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.617727688090161, "grad_norm": 2.6495723724365234, "kl": 3.33984375, "learning_rate": 4.3096548682295304e-06, "loss": 0.0731, "num_tokens": 52730950.0, "reward": -0.13623046875, "reward_std": 0.5614067316055298, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -3.8505859375, "rewards/ppl_reward/std": 1.3580042123794556, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.22479599714279175, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 144.578125, "completions/mean_terminated_length": 144.578125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.6189460858970453, "grad_norm": 3.1989991664886475, "kl": 6.111328125, "learning_rate": 4.302655916512631e-06, "loss": 0.2236, "num_tokens": 52747595.0, "reward": -6.3260498046875, "reward_std": 2.5753326416015625, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "rewards/ppl_reward/mean": -16.066162109375, "rewards/ppl_reward/std": 16.880207061767578, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2121305763721466, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 144.796875, "completions/mean_terminated_length": 144.796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.6201644837039293, "grad_norm": 1.4943714141845703, "kl": 3.482421875, "learning_rate": 4.295661094504594e-06, "loss": 0.0948, "num_tokens": 52763726.0, "reward": -2.26708984375, "reward_std": 1.1283342838287354, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -8.2841796875, "rewards/ppl_reward/std": 6.442580699920654, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 161.5625, "completions/mean_terminated_length": 161.5625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.6213828815108133, "grad_norm": 1.3465802669525146, "kl": 2.2900390625, "learning_rate": 4.288670407275607e-06, "loss": 0.0185, "num_tokens": 52782690.0, "reward": -1.580078125, "reward_std": 0.4003031849861145, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -6.91015625, "rewards/ppl_reward/std": 4.112959384918213, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 149.296875, "completions/mean_terminated_length": 149.296875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.6226012793176974, "grad_norm": 1.8027194738388062, "kl": 6.4140625, "learning_rate": 4.281683859892849e-06, "loss": 0.2074, "num_tokens": 52799733.0, "reward": -0.922119140625, "reward_std": 1.036688208580017, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "rewards/ppl_reward/mean": -5.21923828125, "rewards/ppl_reward/std": 2.361050605773926, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.6238196771245814, "grad_norm": 1.4970967769622803, "kl": 5.07421875, "learning_rate": 4.274701457420507e-06, "loss": 0.1229, "num_tokens": 52815097.0, "reward": -2.411376953125, "reward_std": 1.0596867799758911, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -8.36181640625, "rewards/ppl_reward/std": 6.13430118560791, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.21675680577754974, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 121.046875, "completions/mean_terminated_length": 121.046875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.625038074931465, "grad_norm": 1.7086045742034912, "kl": 4.78125, "learning_rate": 4.267723204919764e-06, "loss": -0.0185, "num_tokens": 52829260.0, "reward": -0.24151611328125, "reward_std": 1.165327787399292, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -3.8424072265625, "rewards/ppl_reward/std": 1.7251355648040771, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2987033724784851, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 151.90625, "completions/mean_terminated_length": 151.90625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 3.626256472738349, "grad_norm": 1.501664161682129, "kl": 3.41796875, "learning_rate": 4.260749107448792e-06, "loss": 0.0899, "num_tokens": 52846230.0, "reward": -1.2374267578125, "reward_std": 0.4526926875114441, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -6.162353515625, "rewards/ppl_reward/std": 3.1597371101379395, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14689241349697113, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 141.578125, "completions/mean_terminated_length": 141.578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.627474870545233, "grad_norm": 1.7297515869140625, "kl": 3.1640625, "learning_rate": 4.253779170062753e-06, "loss": 0.0462, "num_tokens": 52862715.0, "reward": -0.9984130859375, "reward_std": 0.8445370197296143, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.676513671875, "rewards/ppl_reward/std": 2.894747495651245, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1791718602180481, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 151.265625, "completions/mean_terminated_length": 151.265625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.6286932683521167, "grad_norm": 1.3737539052963257, "kl": 2.318359375, "learning_rate": 4.2468133978137945e-06, "loss": -0.0062, "num_tokens": 52879572.0, "reward": -6.630615234375, "reward_std": 4.551175117492676, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -16.97998046875, "rewards/ppl_reward/std": 35.937705993652344, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2314550280570984, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 137.640625, "completions/mean_terminated_length": 137.640625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.6299116661590007, "grad_norm": 2.0922698974609375, "kl": 3.0703125, "learning_rate": 4.239851795751041e-06, "loss": 0.0855, "num_tokens": 52896157.0, "reward": -0.9388427734375, "reward_std": 0.6020916700363159, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.557373046875, "rewards/ppl_reward/std": 2.4871103763580322, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.631130063965885, "grad_norm": 1.3762701749801636, "kl": 2.314453125, "learning_rate": 4.232894368920592e-06, "loss": 0.0354, "num_tokens": 52912941.0, "reward": -0.83172607421875, "reward_std": 0.9653181433677673, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.3822021484375, "rewards/ppl_reward/std": 4.97899866104126, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.632348461772769, "grad_norm": 1.8064488172531128, "kl": 2.0400390625, "learning_rate": 4.225941122365529e-06, "loss": 0.0375, "num_tokens": 52929789.0, "reward": -0.69921875, "reward_std": 0.3487791121006012, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.2578125, "rewards/ppl_reward/std": 3.1273462772369385, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 149.890625, "completions/mean_terminated_length": 149.890625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.633566859579653, "grad_norm": 1.6559078693389893, "kl": 3.45703125, "learning_rate": 4.218992061125899e-06, "loss": 0.0893, "num_tokens": 52946294.0, "reward": -1.1263427734375, "reward_std": 1.1444315910339355, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.979248046875, "rewards/ppl_reward/std": 3.513047456741333, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 144.15625, "completions/mean_terminated_length": 144.15625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 3.634785257386537, "grad_norm": 1.3058592081069946, "kl": 1.1806640625, "learning_rate": 4.212047190238716e-06, "loss": -0.0353, "num_tokens": 52962416.0, "reward": -1.2315673828125, "reward_std": 0.22888106107711792, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -6.424072265625, "rewards/ppl_reward/std": 3.3540451526641846, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 151.765625, "completions/mean_terminated_length": 151.765625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.6360036551934205, "grad_norm": 1.6049259901046753, "kl": 1.904296875, "learning_rate": 4.20510651473796e-06, "loss": 0.0117, "num_tokens": 52979081.0, "reward": 0.0003662109375, "reward_std": 0.3143371641635895, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -3.811767578125, "rewards/ppl_reward/std": 1.3055806159973145, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 146.171875, "completions/mean_terminated_length": 146.171875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.6372220530003045, "grad_norm": 2.4443771839141846, "kl": 4.5224609375, "learning_rate": 4.1981700396545576e-06, "loss": 0.0953, "num_tokens": 52995460.0, "reward": -0.6025390625, "reward_std": 0.5850852131843567, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -4.884765625, "rewards/ppl_reward/std": 2.4822824001312256, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 149.671875, "completions/mean_terminated_length": 149.671875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.6384404508071886, "grad_norm": 3.6472597122192383, "kl": 3.6015625, "learning_rate": 4.191237770016404e-06, "loss": 0.0978, "num_tokens": 53012151.0, "reward": -0.5567626953125, "reward_std": 0.5524845123291016, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -4.840087890625, "rewards/ppl_reward/std": 1.8837835788726807, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1302827149629593, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 156.515625, "completions/mean_terminated_length": 156.515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.6396588486140726, "grad_norm": 1.1837804317474365, "kl": 1.7607421875, "learning_rate": 4.1843097108483465e-06, "loss": 0.0016, "num_tokens": 53029552.0, "reward": -0.6873779296875, "reward_std": 0.27903109788894653, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -5.249755859375, "rewards/ppl_reward/std": 1.6974270343780518, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 147.90625, "completions/mean_terminated_length": 147.90625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.6408772464209562, "grad_norm": 1.2361270189285278, "kl": 2.38671875, "learning_rate": 4.177385867172166e-06, "loss": -0.0194, "num_tokens": 53046362.0, "reward": 0.0098876953125, "reward_std": 0.5341774821281433, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -3.722412109375, "rewards/ppl_reward/std": 1.1135529279708862, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 157.734375, "completions/mean_terminated_length": 157.734375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.6420956442278403, "grad_norm": 1.681734323501587, "kl": 2.10546875, "learning_rate": 4.170466244006601e-06, "loss": 0.053, "num_tokens": 53062985.0, "reward": -0.637451171875, "reward_std": 0.6129747629165649, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.07958984375, "rewards/ppl_reward/std": 1.9369072914123535, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.6433140420347243, "grad_norm": 2.1287007331848145, "kl": 3.712890625, "learning_rate": 4.163550846367328e-06, "loss": 0.1021, "num_tokens": 53078441.0, "reward": -1.9437255859375, "reward_std": 0.9153406620025635, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -7.614013671875, "rewards/ppl_reward/std": 4.009295463562012, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 148.359375, "completions/mean_terminated_length": 148.359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 3.6445324398416084, "grad_norm": 1.623345971107483, "kl": 3.37890625, "learning_rate": 4.1566396792669625e-06, "loss": 0.0182, "num_tokens": 53094840.0, "reward": -1.890380859375, "reward_std": 1.8300206661224365, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -7.31982421875, "rewards/ppl_reward/std": 5.756267070770264, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2212863266468048, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 153.734375, "completions/mean_terminated_length": 153.734375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 3.6457508376484924, "grad_norm": 2.749546766281128, "kl": 3.1513671875, "learning_rate": 4.149732747715044e-06, "loss": 0.1281, "num_tokens": 53111471.0, "reward": -0.958984375, "reward_std": 0.4112228751182556, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": -5.73828125, "rewards/ppl_reward/std": 4.133067607879639, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 161.515625, "completions/mean_terminated_length": 161.515625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 3.6469692354553764, "grad_norm": 2.280216932296753, "kl": 5.578125, "learning_rate": 4.142830056718052e-06, "loss": 0.1812, "num_tokens": 53128720.0, "reward": -2.8809814453125, "reward_std": 1.2699601650238037, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.379150390625, "rewards/ppl_reward/std": 8.004510879516602, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15545432269573212, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 146.9375, "completions/mean_terminated_length": 146.9375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.64818763326226, "grad_norm": 2.5850651264190674, "kl": 3.8671875, "learning_rate": 4.135931611279389e-06, "loss": -0.0012, "num_tokens": 53145340.0, "reward": -1.011962890625, "reward_std": 1.109238862991333, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.46923828125, "rewards/ppl_reward/std": 3.287188768386841, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.270231693983078, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 147.9375, "completions/mean_terminated_length": 147.9375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.649406031069144, "grad_norm": 3.3668315410614014, "kl": 7.359375, "learning_rate": 4.1290374163993785e-06, "loss": 0.2972, "num_tokens": 53162024.0, "reward": -1.092529296875, "reward_std": 1.262749195098877, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -5.70068359375, "rewards/ppl_reward/std": 2.7327003479003906, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21463678777217865, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 146.34375, "completions/mean_terminated_length": 146.34375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.650624428876028, "grad_norm": 1.4319286346435547, "kl": 2.2705078125, "learning_rate": 4.12214747707527e-06, "loss": 0.0225, "num_tokens": 53178094.0, "reward": -0.68609619140625, "reward_std": 0.45988723635673523, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.0987548828125, "rewards/ppl_reward/std": 2.7556443214416504, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 164.703125, "completions/mean_terminated_length": 164.703125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 3.6518428266829117, "grad_norm": 2.3506112098693848, "kl": 4.99609375, "learning_rate": 4.115261798301213e-06, "loss": 0.226, "num_tokens": 53195731.0, "reward": -1.1513671875, "reward_std": 0.5409922003746033, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": -6.154296875, "rewards/ppl_reward/std": 5.167878150939941, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 162.84375, "completions/mean_terminated_length": 162.84375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.6530612244897958, "grad_norm": 1.970435380935669, "kl": 4.95703125, "learning_rate": 4.108380385068289e-06, "loss": 0.2153, "num_tokens": 53213369.0, "reward": -1.769775390625, "reward_std": 0.49882346391677856, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -7.25830078125, "rewards/ppl_reward/std": 3.076148509979248, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 166.984375, "completions/mean_terminated_length": 166.984375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 3.65427962229668, "grad_norm": 2.293652057647705, "kl": 2.2587890625, "learning_rate": 4.101503242364467e-06, "loss": 0.0616, "num_tokens": 53231568.0, "reward": -0.1724853515625, "reward_std": 0.17964263260364532, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": -4.266845703125, "rewards/ppl_reward/std": 1.7027082443237305, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.655498020103564, "grad_norm": 3.1076719760894775, "kl": 4.19921875, "learning_rate": 4.094630375174636e-06, "loss": 0.1592, "num_tokens": 53247728.0, "reward": -1.2227783203125, "reward_std": 0.7794523239135742, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.023681640625, "rewards/ppl_reward/std": 2.929230213165283, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 148.890625, "completions/mean_terminated_length": 148.890625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.656716417910448, "grad_norm": 2.4554965496063232, "kl": 6.3623046875, "learning_rate": 4.0877617884805785e-06, "loss": 0.2518, "num_tokens": 53263937.0, "reward": -2.87890625, "reward_std": 0.7571399807929993, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -9.4140625, "rewards/ppl_reward/std": 10.715198516845703, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 164.5625, "completions/mean_terminated_length": 164.5625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.657934815717332, "grad_norm": 1.3909066915512085, "kl": 3.390625, "learning_rate": 4.080897487260978e-06, "loss": 0.1162, "num_tokens": 53281237.0, "reward": -1.1766357421875, "reward_std": 0.507066011428833, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.025146484375, "rewards/ppl_reward/std": 4.3486714363098145, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11967839300632477, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 150.71875, "completions/mean_terminated_length": 150.71875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.6591532135242155, "grad_norm": 3.672211170196533, "kl": 6.9306640625, "learning_rate": 4.074037476491414e-06, "loss": 0.2453, "num_tokens": 53297539.0, "reward": -4.259765625, "reward_std": 1.0505428314208984, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -12.10546875, "rewards/ppl_reward/std": 10.382370948791504, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 180.140625, "completions/mean_terminated_length": 180.140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.6603716113310996, "grad_norm": 3.1914520263671875, "kl": 7.341796875, "learning_rate": 4.067181761144346e-06, "loss": 0.3141, "num_tokens": 53317108.0, "reward": -1.022705078125, "reward_std": 1.0370036363601685, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -5.56103515625, "rewards/ppl_reward/std": 3.2227938175201416, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 142.734375, "completions/mean_terminated_length": 142.734375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.6615900091379836, "grad_norm": 2.677504539489746, "kl": 3.90625, "learning_rate": 4.060330346189125e-06, "loss": 0.1514, "num_tokens": 53332819.0, "reward": -1.091796875, "reward_std": 1.5635954141616821, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.76171875, "rewards/ppl_reward/std": 6.275195121765137, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19142712652683258, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 158.6875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.6628084069448676, "grad_norm": 2.3393824100494385, "kl": 6.66015625, "learning_rate": 4.053483236591989e-06, "loss": 0.3342, "num_tokens": 53350015.0, "reward": -0.519287109375, "reward_std": 0.638074517250061, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -4.61669921875, "rewards/ppl_reward/std": 2.8207108974456787, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16347388923168182, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 137.015625, "completions/mean_terminated_length": 137.015625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.6640268047517512, "grad_norm": 1.7244093418121338, "kl": 3.703125, "learning_rate": 4.04664043731605e-06, "loss": 0.0344, "num_tokens": 53365360.0, "reward": -1.3543701171875, "reward_std": 1.1351522207260132, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": -6.255615234375, "rewards/ppl_reward/std": 3.584733009338379, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18617255985736847, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 149.890625, "completions/mean_terminated_length": 149.890625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.6652452025586353, "grad_norm": 3.2908787727355957, "kl": 6.19921875, "learning_rate": 4.039801953321302e-06, "loss": 0.2092, "num_tokens": 53382673.0, "reward": -0.3974609375, "reward_std": 0.7094044089317322, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.263671875, "rewards/ppl_reward/std": 2.2051444053649902, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 138.234375, "completions/mean_terminated_length": 138.234375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.6664636003655193, "grad_norm": 5.8880462646484375, "kl": 6.5625, "learning_rate": 4.0329677895646035e-06, "loss": 0.2078, "num_tokens": 53397904.0, "reward": -1.7894287109375, "reward_std": 2.2314891815185547, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -7.039794921875, "rewards/ppl_reward/std": 6.960474491119385, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18395289778709412, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 134.203125, "completions/mean_terminated_length": 134.203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.6676819981724034, "grad_norm": 2.1352031230926514, "kl": 3.76953125, "learning_rate": 4.026137950999689e-06, "loss": 0.0257, "num_tokens": 53412989.0, "reward": -1.66357421875, "reward_std": 1.155853033065796, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.8193359375, "rewards/ppl_reward/std": 3.339477777481079, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.23168931901454926, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 149.09375, "completions/mean_terminated_length": 149.09375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.6689003959792874, "grad_norm": 2.210310459136963, "kl": 4.91796875, "learning_rate": 4.019312442577148e-06, "loss": 0.1331, "num_tokens": 53429491.0, "reward": -0.5081787109375, "reward_std": 1.038593053817749, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -4.477294921875, "rewards/ppl_reward/std": 2.358306407928467, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.23168931901454926, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 149.109375, "completions/mean_terminated_length": 149.109375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 3.6701187937861715, "grad_norm": 1.928673267364502, "kl": 3.009765625, "learning_rate": 4.01249126924444e-06, "loss": 0.0927, "num_tokens": 53446226.0, "reward": -1.1058349609375, "reward_std": 0.5968747735023499, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.961669921875, "rewards/ppl_reward/std": 3.292992115020752, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12198751419782639, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 134.828125, "completions/mean_terminated_length": 134.828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.671337191593055, "grad_norm": 1.7343459129333496, "kl": 3.169921875, "learning_rate": 4.005674435945881e-06, "loss": 0.0373, "num_tokens": 53461111.0, "reward": -1.29541015625, "reward_std": 0.9628626108169556, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.0751953125, "rewards/ppl_reward/std": 3.07401967048645, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 141.96875, "completions/mean_terminated_length": 141.96875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.672555589399939, "grad_norm": 1.6146036386489868, "kl": 2.4072265625, "learning_rate": 3.998861947622635e-06, "loss": -0.0079, "num_tokens": 53477357.0, "reward": -1.541259765625, "reward_std": 0.6124660968780518, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -6.91845703125, "rewards/ppl_reward/std": 1.932040810585022, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 158.21875, "completions/mean_terminated_length": 158.21875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.673773987206823, "grad_norm": 2.2262508869171143, "kl": 7.40625, "learning_rate": 3.992053809212725e-06, "loss": 0.305, "num_tokens": 53494475.0, "reward": -2.7147216796875, "reward_std": 1.2367329597473145, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -8.929443359375, "rewards/ppl_reward/std": 7.852654457092285, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16060402989387512, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 150.28125, "completions/mean_terminated_length": 150.28125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.6749923850137067, "grad_norm": 1.7122787237167358, "kl": 2.80078125, "learning_rate": 3.985250025651018e-06, "loss": 0.0748, "num_tokens": 53511373.0, "reward": -1.25732421875, "reward_std": 0.5535504817962646, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.2255859375, "rewards/ppl_reward/std": 2.4123685359954834, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 133.078125, "completions/mean_terminated_length": 133.078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.6762107828205908, "grad_norm": 1.7421643733978271, "kl": 4.07421875, "learning_rate": 3.978450601869217e-06, "loss": 0.0045, "num_tokens": 53526498.0, "reward": -2.7947998046875, "reward_std": 1.2654170989990234, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -9.034912109375, "rewards/ppl_reward/std": 9.612581253051758, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2627868354320526, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 148.859375, "completions/mean_terminated_length": 148.859375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.677429180627475, "grad_norm": 4.309140682220459, "kl": 4.171875, "learning_rate": 3.971655542795876e-06, "loss": 0.0956, "num_tokens": 53543769.0, "reward": -1.29052734375, "reward_std": 0.6886367201805115, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -6.1982421875, "rewards/ppl_reward/std": 3.0199477672576904, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2052978277206421, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 138.84375, "completions/mean_terminated_length": 138.84375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.678647578434359, "grad_norm": 1.563970685005188, "kl": 3.10546875, "learning_rate": 3.964864853356378e-06, "loss": 0.0861, "num_tokens": 53559839.0, "reward": -1.0135498046875, "reward_std": 0.6766849756240845, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.761474609375, "rewards/ppl_reward/std": 2.877755641937256, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16194961965084076, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 137.640625, "completions/mean_terminated_length": 137.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.679865976241243, "grad_norm": 2.160811424255371, "kl": 1.998046875, "learning_rate": 3.958078538472944e-06, "loss": 0.0597, "num_tokens": 53575480.0, "reward": -1.03173828125, "reward_std": 0.5519840717315674, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": -5.8603515625, "rewards/ppl_reward/std": 3.0227456092834473, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 138.3125, "completions/mean_terminated_length": 138.3125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.681084374048127, "grad_norm": 2.1155612468719482, "kl": 3.8349609375, "learning_rate": 3.95129660306462e-06, "loss": 0.1876, "num_tokens": 53591364.0, "reward": -1.5146484375, "reward_std": 0.9429637789726257, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -6.740234375, "rewards/ppl_reward/std": 3.4802019596099854, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 136.359375, "completions/mean_terminated_length": 136.359375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.6823027718550105, "grad_norm": 2.440946102142334, "kl": 3.3779296875, "learning_rate": 3.944519052047283e-06, "loss": 0.0516, "num_tokens": 53606635.0, "reward": -2.23974609375, "reward_std": 1.6737380027770996, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.0888671875, "rewards/ppl_reward/std": 5.402892112731934, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 155.5625, "completions/mean_terminated_length": 155.5625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.6835211696618946, "grad_norm": 1.8019312620162964, "kl": 5.1953125, "learning_rate": 3.937745890333623e-06, "loss": 0.1235, "num_tokens": 53624911.0, "reward": -1.4072265625, "reward_std": 0.8406714200973511, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -6.291015625, "rewards/ppl_reward/std": 2.6651875972747803, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18729320168495178, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 158.15625, "completions/mean_terminated_length": 158.15625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.6847395674687786, "grad_norm": 2.020979881286621, "kl": 4.2177734375, "learning_rate": 3.930977122833151e-06, "loss": 0.0856, "num_tokens": 53642729.0, "reward": -0.81988525390625, "reward_std": 0.6087542772293091, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -5.2882080078125, "rewards/ppl_reward/std": 4.101986408233643, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16171015799045563, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 142.921875, "completions/mean_terminated_length": 142.921875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.6859579652756627, "grad_norm": 1.7645554542541504, "kl": 3.259765625, "learning_rate": 3.924212754452197e-06, "loss": 0.0403, "num_tokens": 53659324.0, "reward": -2.362548828125, "reward_std": 0.8086563348770142, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -8.33447265625, "rewards/ppl_reward/std": 5.651198387145996, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21114179491996765, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 136.09375, "completions/mean_terminated_length": 136.09375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.6871763630825463, "grad_norm": 1.9572635889053345, "kl": 4.23828125, "learning_rate": 3.917452790093901e-06, "loss": 0.1355, "num_tokens": 53674810.0, "reward": -4.72705078125, "reward_std": 1.5447865724563599, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -13.1103515625, "rewards/ppl_reward/std": 13.916797637939453, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 150.78125, "completions/mean_terminated_length": 150.78125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.6883947608894303, "grad_norm": 1.5656538009643555, "kl": 5.1953125, "learning_rate": 3.91069723465821e-06, "loss": 0.1881, "num_tokens": 53692348.0, "reward": -3.11407470703125, "reward_std": 1.4231497049331665, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": -9.8687744140625, "rewards/ppl_reward/std": 10.258295059204102, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 124.84375, "completions/mean_terminated_length": 124.84375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.6896131586963143, "grad_norm": 2.176873207092285, "kl": 4.025390625, "learning_rate": 3.903946093041877e-06, "loss": 0.1073, "num_tokens": 53706290.0, "reward": -0.892822265625, "reward_std": 0.5823583602905273, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": -5.52783203125, "rewards/ppl_reward/std": 3.155752182006836, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 141.046875, "completions/mean_terminated_length": 141.046875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.6908315565031984, "grad_norm": 2.721531391143799, "kl": 6.234375, "learning_rate": 3.897199370138456e-06, "loss": 0.2762, "num_tokens": 53722437.0, "reward": -1.34881591796875, "reward_std": 0.716838538646698, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": -6.2366943359375, "rewards/ppl_reward/std": 5.3278045654296875, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18225978314876556, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 149.421875, "completions/mean_terminated_length": 149.421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.6920499543100824, "grad_norm": 1.098393201828003, "kl": 2.654296875, "learning_rate": 3.890457070838288e-06, "loss": 0.031, "num_tokens": 53739016.0, "reward": -0.79290771484375, "reward_std": 0.7928793430328369, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": -5.2108154296875, "rewards/ppl_reward/std": 3.474182605743408, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16592095792293549, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.693268352116966, "grad_norm": 1.579316258430481, "kl": 3.79296875, "learning_rate": 3.8837192000285255e-06, "loss": 0.1209, "num_tokens": 53755459.0, "reward": -5.407470703125, "reward_std": 1.7282944917678833, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": -14.31494140625, "rewards/ppl_reward/std": 14.556315422058105, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17817416787147522, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 135.890625, "completions/mean_terminated_length": 135.890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.69448674992385, "grad_norm": 1.9729030132293701, "kl": 3.966796875, "learning_rate": 3.876985762593091e-06, "loss": 0.1186, "num_tokens": 53770900.0, "reward": -0.802490234375, "reward_std": 0.8159666061401367, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": -5.34716796875, "rewards/ppl_reward/std": 3.399592161178589, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 136.28125, "completions/mean_terminated_length": 136.28125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.695705147730734, "grad_norm": 1.9194494485855103, "kl": 4.28125, "learning_rate": 3.87025676341271e-06, "loss": 0.053, "num_tokens": 53786590.0, "reward": -5.7586669921875, "reward_std": 0.9861401319503784, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": -15.001708984375, "rewards/ppl_reward/std": 26.854333877563477, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1508491188287735, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 144.921875, "completions/mean_terminated_length": 144.921875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.696923545537618, "grad_norm": 1.978165626525879, "kl": 4.86328125, "learning_rate": 3.8635322073648815e-06, "loss": 0.176, "num_tokens": 53803121.0, "reward": 1.76953125, "reward_std": 0.494045615196228, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17102740705013275, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 129.453125, "completions/mean_terminated_length": 129.453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.6981419433445017, "grad_norm": 0.9598228931427002, "kl": 3.171875, "learning_rate": 3.8568120993238936e-06, "loss": 0.0545, "num_tokens": 53818406.0, "reward": 1.84375, "reward_std": 0.3053697645664215, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 137.671875, "completions/mean_terminated_length": 137.671875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.699360341151386, "grad_norm": 1.910530686378479, "kl": 4.021484375, "learning_rate": 3.850096444160795e-06, "loss": 0.0935, "num_tokens": 53834089.0, "reward": 1.8203125, "reward_std": 0.4328922629356384, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.180765300989151, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 145.859375, "completions/mean_terminated_length": 145.859375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.70057873895827, "grad_norm": 1.5608655214309692, "kl": 2.37109375, "learning_rate": 3.8433852467434175e-06, "loss": 0.1072, "num_tokens": 53850392.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 158.578125, "completions/mean_terminated_length": 158.578125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.701797136765154, "grad_norm": 1.1694271564483643, "kl": 3.8046875, "learning_rate": 3.8366785119363624e-06, "loss": 0.0807, "num_tokens": 53869317.0, "reward": 1.84765625, "reward_std": 0.37517493963241577, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 138.734375, "completions/mean_terminated_length": 138.734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.703015534572038, "grad_norm": 2.510044574737549, "kl": 2.96484375, "learning_rate": 3.829976244600992e-06, "loss": 0.0612, "num_tokens": 53885412.0, "reward": 1.81640625, "reward_std": 0.4287659525871277, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.19283901154994965, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 136.953125, "completions/mean_terminated_length": 136.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 3.704233932378922, "grad_norm": 2.056506395339966, "kl": 5.53515625, "learning_rate": 3.823278449595437e-06, "loss": 0.1588, "num_tokens": 53901057.0, "reward": 1.7265625, "reward_std": 0.48108887672424316, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2280818521976471, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 139.890625, "completions/mean_terminated_length": 139.890625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.7054523301858056, "grad_norm": 1.585465431213379, "kl": 3.0966796875, "learning_rate": 3.8165851317745705e-06, "loss": 0.0781, "num_tokens": 53917306.0, "reward": 1.91796875, "reward_std": 0.19048526883125305, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 122.46875, "completions/mean_terminated_length": 122.46875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.7066707279926896, "grad_norm": 3.5169870853424072, "kl": 4.94921875, "learning_rate": 3.809896295990042e-06, "loss": 0.0413, "num_tokens": 53931304.0, "reward": 1.72265625, "reward_std": 0.6058717966079712, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.24138814210891724, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 176.359375, "completions/mean_terminated_length": 162.90476989746094, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.7078891257995736, "grad_norm": 1.2405619621276855, "kl": 3.1328125, "learning_rate": 3.803211947090232e-06, "loss": 0.17, "num_tokens": 53951015.0, "reward": 1.86328125, "reward_std": 0.3425048589706421, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 128.859375, "completions/mean_terminated_length": 128.859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.7091075236064572, "grad_norm": 4.832205295562744, "kl": 6.08203125, "learning_rate": 3.7965320899202816e-06, "loss": 0.1232, "num_tokens": 53965958.0, "reward": 1.73046875, "reward_std": 0.5133610963821411, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1953943371772766, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 154.515625, "completions/mean_terminated_length": 154.515625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 3.7103259214133413, "grad_norm": 1.098107099533081, "kl": 2.8095703125, "learning_rate": 3.7898567293220724e-06, "loss": 0.0951, "num_tokens": 53983271.0, "reward": 1.92578125, "reward_std": 0.20992232859134674, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 145.546875, "completions/mean_terminated_length": 145.546875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.7115443192202253, "grad_norm": 1.1675682067871094, "kl": 3.609375, "learning_rate": 3.7831858701342283e-06, "loss": 0.0827, "num_tokens": 53999802.0, "reward": 1.85546875, "reward_std": 0.370640367269516, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 144.640625, "completions/mean_terminated_length": 144.640625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.7127627170271094, "grad_norm": 1.4178305864334106, "kl": 2.28125, "learning_rate": 3.7765195171921098e-06, "loss": 0.0549, "num_tokens": 54016139.0, "reward": 1.88671875, "reward_std": 0.286601185798645, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 114.671875, "completions/mean_terminated_length": 114.671875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.7139811148339934, "grad_norm": 1.8330159187316895, "kl": 4.26171875, "learning_rate": 3.769857675327804e-06, "loss": 0.0289, "num_tokens": 54029678.0, "reward": 1.76171875, "reward_std": 0.5095065832138062, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19024936854839325, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.7151995126408774, "grad_norm": 1.8815574645996094, "kl": 4.13671875, "learning_rate": 3.763200349370141e-06, "loss": 0.1057, "num_tokens": 54045466.0, "reward": 1.81640625, "reward_std": 0.36594951152801514, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 130.59375, "completions/mean_terminated_length": 130.59375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.716417910447761, "grad_norm": 2.9559738636016846, "kl": 5.32421875, "learning_rate": 3.756547544144664e-06, "loss": 0.2211, "num_tokens": 54060256.0, "reward": 1.828125, "reward_std": 0.38768914341926575, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1717960685491562, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 138.953125, "completions/mean_terminated_length": 138.953125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.717636308254645, "grad_norm": 1.7174354791641235, "kl": 3.876953125, "learning_rate": 3.7498992644736487e-06, "loss": 0.1584, "num_tokens": 54076557.0, "reward": 1.91015625, "reward_std": 0.22030992805957794, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 146.09375, "completions/mean_terminated_length": 146.09375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.718854706061529, "grad_norm": 1.2975670099258423, "kl": 5.2607421875, "learning_rate": 3.7432555151760896e-06, "loss": 0.2533, "num_tokens": 54093267.0, "reward": 1.8515625, "reward_std": 0.30634817481040955, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.720073103868413, "grad_norm": 1.15662682056427, "kl": 4.1904296875, "learning_rate": 3.7366163010676937e-06, "loss": 0.2015, "num_tokens": 54109006.0, "reward": 1.890625, "reward_std": 0.24174606800079346, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 129.109375, "completions/mean_terminated_length": 129.109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.7212915016752968, "grad_norm": 1.5232754945755005, "kl": 3.75390625, "learning_rate": 3.729981626960887e-06, "loss": 0.0805, "num_tokens": 54123861.0, "reward": 1.84765625, "reward_std": 0.34540891647338867, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15782932937145233, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 149.90625, "completions/mean_terminated_length": 149.90625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.722509899482181, "grad_norm": 1.585644006729126, "kl": 3.419921875, "learning_rate": 3.723351497664792e-06, "loss": 0.06, "num_tokens": 54140807.0, "reward": 1.83984375, "reward_std": 0.3150845766067505, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 130.28125, "completions/mean_terminated_length": 130.28125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 3.723728297289065, "grad_norm": 2.2661049365997314, "kl": 4.92578125, "learning_rate": 3.71672591798525e-06, "loss": 0.2059, "num_tokens": 54155969.0, "reward": 1.87890625, "reward_std": 0.2432999312877655, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 145.5625, "completions/mean_terminated_length": 145.5625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.724946695095949, "grad_norm": 1.034118413925171, "kl": 3.0947265625, "learning_rate": 3.710104892724803e-06, "loss": 0.0393, "num_tokens": 54172701.0, "reward": 1.81640625, "reward_std": 0.32943665981292725, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1876239776611328, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 125.015625, "completions/mean_terminated_length": 125.015625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.726165092902833, "grad_norm": 1.7658166885375977, "kl": 2.421875, "learning_rate": 3.703488426682681e-06, "loss": 0.0892, "num_tokens": 54187310.0, "reward": 1.93359375, "reward_std": 0.1497059315443039, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 135.21875, "completions/mean_terminated_length": 135.21875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.727383490709717, "grad_norm": 1.0493816137313843, "kl": 2.6640625, "learning_rate": 3.69687652465482e-06, "loss": 0.0617, "num_tokens": 54202772.0, "reward": 1.88671875, "reward_std": 0.24844545125961304, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 118.78125, "completions/mean_terminated_length": 118.78125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.7286018885166006, "grad_norm": 1.5154370069503784, "kl": 2.60546875, "learning_rate": 3.690269191433845e-06, "loss": 0.0579, "num_tokens": 54216710.0, "reward": 1.87109375, "reward_std": 0.2926396131515503, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.7298202863234846, "grad_norm": 1.6791976690292358, "kl": 3.7177734375, "learning_rate": 3.683666431809071e-06, "loss": 0.205, "num_tokens": 54232818.0, "reward": 1.88671875, "reward_std": 0.252518892288208, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 130.265625, "completions/mean_terminated_length": 130.265625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.7310386841303687, "grad_norm": 1.9653418064117432, "kl": 2.685546875, "learning_rate": 3.6770682505664903e-06, "loss": 0.1521, "num_tokens": 54247875.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 134.734375, "completions/mean_terminated_length": 134.734375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.7322570819372523, "grad_norm": 0.6835494637489319, "kl": 1.7421875, "learning_rate": 3.6704746524887835e-06, "loss": 0.0429, "num_tokens": 54263394.0, "reward": 1.92578125, "reward_std": 0.16413544118404388, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1269075721502304, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 119.15625, "completions/mean_terminated_length": 119.15625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.7334754797441363, "grad_norm": 1.4464293718338013, "kl": 1.919921875, "learning_rate": 3.663885642355309e-06, "loss": 0.0429, "num_tokens": 54277364.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 140.609375, "completions/mean_terminated_length": 140.609375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.7346938775510203, "grad_norm": 1.8797616958618164, "kl": 5.9375, "learning_rate": 3.657301224942098e-06, "loss": 0.3344, "num_tokens": 54293795.0, "reward": 1.8984375, "reward_std": 0.2534555494785309, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.7359122753579044, "grad_norm": 1.6988937854766846, "kl": 3.947265625, "learning_rate": 3.6507214050218544e-06, "loss": 0.1518, "num_tokens": 54310051.0, "reward": 1.87109375, "reward_std": 0.28144800662994385, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 135.703125, "completions/mean_terminated_length": 135.703125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 3.7371306731647884, "grad_norm": 0.839477002620697, "kl": 2.17578125, "learning_rate": 3.644146187363947e-06, "loss": 0.1124, "num_tokens": 54326448.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 128.671875, "completions/mean_terminated_length": 128.671875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.7383490709716725, "grad_norm": 1.4535161256790161, "kl": 3.763671875, "learning_rate": 3.6375755767344047e-06, "loss": 0.1634, "num_tokens": 54340931.0, "reward": 1.84375, "reward_std": 0.3516421914100647, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1534975916147232, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 124.703125, "completions/mean_terminated_length": 124.703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.739567468778556, "grad_norm": 3.6086277961730957, "kl": 3.7265625, "learning_rate": 3.6310095778959253e-06, "loss": 0.0747, "num_tokens": 54355920.0, "reward": 1.765625, "reward_std": 0.43870699405670166, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2221602201461792, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 129.390625, "completions/mean_terminated_length": 129.390625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.74078586658544, "grad_norm": 1.4992573261260986, "kl": 5.78515625, "learning_rate": 3.6244481956078605e-06, "loss": 0.2447, "num_tokens": 54371233.0, "reward": 1.8359375, "reward_std": 0.38434892892837524, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12769903242588043, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 3.742004264392324, "grad_norm": 1.9717546701431274, "kl": 4.2734375, "learning_rate": 3.6178914346262163e-06, "loss": 0.1602, "num_tokens": 54385569.0, "reward": 1.90625, "reward_std": 0.223630890250206, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 127.265625, "completions/mean_terminated_length": 127.265625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.743222662199208, "grad_norm": 1.7471976280212402, "kl": 4.693359375, "learning_rate": 3.6113392997036476e-06, "loss": 0.1414, "num_tokens": 54400962.0, "reward": 1.82421875, "reward_std": 0.3583110570907593, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17354662716388702, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 135.78125, "completions/mean_terminated_length": 135.78125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.744441060006092, "grad_norm": 1.5607404708862305, "kl": 4.625, "learning_rate": 3.6047917955894606e-06, "loss": 0.2357, "num_tokens": 54417564.0, "reward": 1.88671875, "reward_std": 0.30856597423553467, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 130.296875, "completions/mean_terminated_length": 130.296875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.745659457812976, "grad_norm": 1.6841562986373901, "kl": 3.93359375, "learning_rate": 3.598248927029594e-06, "loss": 0.1218, "num_tokens": 54433031.0, "reward": 1.87109375, "reward_std": 0.32644620537757874, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 129.078125, "completions/mean_terminated_length": 129.078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.74687785561986, "grad_norm": 1.6910449266433716, "kl": 2.6083984375, "learning_rate": 3.5917106987666393e-06, "loss": 0.0425, "num_tokens": 54448372.0, "reward": 1.89453125, "reward_std": 0.2171969711780548, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 118.046875, "completions/mean_terminated_length": 118.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.748096253426744, "grad_norm": 1.2043806314468384, "kl": 2.64453125, "learning_rate": 3.585177115539814e-06, "loss": 0.0757, "num_tokens": 54462543.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 142.234375, "completions/mean_terminated_length": 142.234375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 3.749314651233628, "grad_norm": 1.9849958419799805, "kl": 3.6796875, "learning_rate": 3.578648182084975e-06, "loss": 0.2302, "num_tokens": 54479174.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 128.796875, "completions/mean_terminated_length": 128.796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.750533049040512, "grad_norm": 0.8585145473480225, "kl": 1.888671875, "learning_rate": 3.5721239031346067e-06, "loss": 0.0324, "num_tokens": 54494857.0, "reward": 1.88671875, "reward_std": 0.2067292183637619, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 122.859375, "completions/mean_terminated_length": 122.859375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.7517514468473956, "grad_norm": 1.3501991033554077, "kl": 2.5078125, "learning_rate": 3.5656042834178216e-06, "loss": 0.0513, "num_tokens": 54509440.0, "reward": 1.921875, "reward_std": 0.20902998745441437, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 115.984375, "completions/mean_terminated_length": 115.984375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.7529698446542796, "grad_norm": 1.714967966079712, "kl": 4.068359375, "learning_rate": 3.5590893276603565e-06, "loss": 0.1306, "num_tokens": 54523399.0, "reward": 1.80859375, "reward_std": 0.3927299976348877, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 115.859375, "completions/mean_terminated_length": 115.859375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.7541882424611637, "grad_norm": 1.0449674129486084, "kl": 1.5966796875, "learning_rate": 3.552579040584557e-06, "loss": 0.0404, "num_tokens": 54537254.0, "reward": 1.91796875, "reward_std": 0.1938636600971222, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 134.390625, "completions/mean_terminated_length": 134.390625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.7554066402680473, "grad_norm": 0.9274829030036926, "kl": 1.708984375, "learning_rate": 3.5460734269093968e-06, "loss": 0.0399, "num_tokens": 54553535.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 127.484375, "completions/mean_terminated_length": 127.484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.7566250380749313, "grad_norm": 3.2202701568603516, "kl": 4.703125, "learning_rate": 3.5395724913504546e-06, "loss": 0.1476, "num_tokens": 54569102.0, "reward": 1.8046875, "reward_std": 0.3455796539783478, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14919592440128326, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.7578434358818154, "grad_norm": 2.830603837966919, "kl": 3.75, "learning_rate": 3.533076238619928e-06, "loss": 0.1317, "num_tokens": 54583414.0, "reward": 1.7890625, "reward_std": 0.4539563059806824, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16943387687206268, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 125.84375, "completions/mean_terminated_length": 125.84375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.7590618336886994, "grad_norm": 1.7656432390213013, "kl": 3.7421875, "learning_rate": 3.526584673426604e-06, "loss": 0.0886, "num_tokens": 54598700.0, "reward": 1.796875, "reward_std": 0.3655853271484375, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17251639068126678, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 126.59375, "completions/mean_terminated_length": 126.59375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.7602802314955834, "grad_norm": 1.2668434381484985, "kl": 3.4453125, "learning_rate": 3.520097800475889e-06, "loss": 0.1279, "num_tokens": 54613738.0, "reward": 1.9140625, "reward_std": 0.20926138758659363, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 128.234375, "completions/mean_terminated_length": 128.234375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.7614986293024675, "grad_norm": 1.5450797080993652, "kl": 3.84765625, "learning_rate": 3.513615624469774e-06, "loss": 0.178, "num_tokens": 54629177.0, "reward": 1.90234375, "reward_std": 0.24240700900554657, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 121.796875, "completions/mean_terminated_length": 121.796875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.762717027109351, "grad_norm": 1.9168256521224976, "kl": 3.740234375, "learning_rate": 3.5071381501068536e-06, "loss": 0.198, "num_tokens": 54643700.0, "reward": 1.8984375, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 117.46875, "completions/mean_terminated_length": 117.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.763935424916235, "grad_norm": 1.0372066497802734, "kl": 2.1240234375, "learning_rate": 3.500665382082313e-06, "loss": 0.0877, "num_tokens": 54657554.0, "reward": 1.9375, "reward_std": 0.1767767071723938, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 117.953125, "completions/mean_terminated_length": 117.953125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.765153822723119, "grad_norm": 1.4138935804367065, "kl": 4.41796875, "learning_rate": 3.494197325087927e-06, "loss": 0.1567, "num_tokens": 54671999.0, "reward": 1.8671875, "reward_std": 0.3186172842979431, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 105.40625, "completions/mean_terminated_length": 105.40625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.766372220530003, "grad_norm": 2.7179417610168457, "kl": 2.556640625, "learning_rate": 3.487733983812053e-06, "loss": 0.1353, "num_tokens": 54685201.0, "reward": 1.91015625, "reward_std": 0.2541164755821228, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 116.859375, "completions/mean_terminated_length": 116.859375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.767590618336887, "grad_norm": 1.9210180044174194, "kl": 4.49609375, "learning_rate": 3.4812753629396346e-06, "loss": 0.1808, "num_tokens": 54699384.0, "reward": 1.92578125, "reward_std": 0.17367032170295715, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 128.6875, "completions/mean_terminated_length": 128.6875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.768809016143771, "grad_norm": 1.9760512113571167, "kl": 4.6171875, "learning_rate": 3.4748214671521875e-06, "loss": 0.2142, "num_tokens": 54714788.0, "reward": 1.83203125, "reward_std": 0.38343530893325806, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15141324698925018, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 125.796875, "completions/mean_terminated_length": 125.796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.770027413950655, "grad_norm": 2.1942954063415527, "kl": 3.765625, "learning_rate": 3.4683723011278024e-06, "loss": 0.0901, "num_tokens": 54731247.0, "reward": 1.859375, "reward_std": 0.3043211102485657, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1598300188779831, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 117.078125, "completions/mean_terminated_length": 117.078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.771245811757539, "grad_norm": 2.56199049949646, "kl": 5.0390625, "learning_rate": 3.4619278695411495e-06, "loss": 0.212, "num_tokens": 54745780.0, "reward": 1.87109375, "reward_std": 0.24975988268852234, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 108.640625, "completions/mean_terminated_length": 108.640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.772464209564423, "grad_norm": 1.3831177949905396, "kl": 3.060546875, "learning_rate": 3.4554881770634605e-06, "loss": 0.1354, "num_tokens": 54758909.0, "reward": 1.9140625, "reward_std": 0.18609270453453064, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 135.484375, "completions/mean_terminated_length": 135.484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.773682607371307, "grad_norm": 1.2000179290771484, "kl": 4.78515625, "learning_rate": 3.4490532283625355e-06, "loss": 0.2608, "num_tokens": 54774324.0, "reward": 1.859375, "reward_std": 0.3180941045284271, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17747680842876434, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 128.390625, "completions/mean_terminated_length": 128.390625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.7749010051781906, "grad_norm": 1.1318820714950562, "kl": 2.38671875, "learning_rate": 3.4426230281027374e-06, "loss": 0.1079, "num_tokens": 54789453.0, "reward": 1.921875, "reward_std": 0.18281513452529907, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 120.265625, "completions/mean_terminated_length": 120.265625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.7761194029850746, "grad_norm": 2.1162383556365967, "kl": 6.125, "learning_rate": 3.4361975809449766e-06, "loss": 0.335, "num_tokens": 54803878.0, "reward": 1.87890625, "reward_std": 0.2890298068523407, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 121.359375, "completions/mean_terminated_length": 121.359375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.7773378007919587, "grad_norm": 2.202695846557617, "kl": 4.09375, "learning_rate": 3.429776891546732e-06, "loss": 0.1507, "num_tokens": 54818685.0, "reward": 1.90234375, "reward_std": 0.24240700900554657, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.7785561985988423, "grad_norm": 1.3172687292099, "kl": 2.44921875, "learning_rate": 3.423360964562026e-06, "loss": 0.1708, "num_tokens": 54833369.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 130.34375, "completions/mean_terminated_length": 130.34375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.7797745964057263, "grad_norm": 1.6525683403015137, "kl": 4.41796875, "learning_rate": 3.4169498046414353e-06, "loss": 0.2266, "num_tokens": 54849087.0, "reward": 1.88671875, "reward_std": 0.24181771278381348, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06762243062257767, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 124.046875, "completions/mean_terminated_length": 124.046875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.7809929942126104, "grad_norm": 1.0332123041152954, "kl": 2.7841796875, "learning_rate": 3.4105434164320695e-06, "loss": 0.0775, "num_tokens": 54863906.0, "reward": 1.87890625, "reward_std": 0.2647186815738678, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 115.578125, "completions/mean_terminated_length": 115.578125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.7822113920194944, "grad_norm": 1.7912219762802124, "kl": 4.462890625, "learning_rate": 3.40414180457759e-06, "loss": 0.1646, "num_tokens": 54877879.0, "reward": 1.8671875, "reward_std": 0.37565046548843384, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15570341050624847, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 125.78125, "completions/mean_terminated_length": 125.78125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 3.7834297898263785, "grad_norm": 1.2855079174041748, "kl": 2.5771484375, "learning_rate": 3.3977449737181955e-06, "loss": 0.102, "num_tokens": 54893265.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 3.7846481876332625, "grad_norm": 1.2581263780593872, "kl": 4.05078125, "learning_rate": 3.391352928490611e-06, "loss": 0.2372, "num_tokens": 54908833.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 120.359375, "completions/mean_terminated_length": 120.359375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.785866585440146, "grad_norm": 1.5451666116714478, "kl": 3.859375, "learning_rate": 3.3849656735281023e-06, "loss": 0.1953, "num_tokens": 54923520.0, "reward": 1.87109375, "reward_std": 0.3230677843093872, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1249379813671112, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 122.578125, "completions/mean_terminated_length": 122.578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.78708498324703, "grad_norm": 1.3842123746871948, "kl": 3.697265625, "learning_rate": 3.3785832134604557e-06, "loss": 0.1031, "num_tokens": 54938461.0, "reward": 1.85546875, "reward_std": 0.32866811752319336, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 114.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.788303381053914, "grad_norm": 1.25386381149292, "kl": 2.55078125, "learning_rate": 3.3722055529139887e-06, "loss": 0.1147, "num_tokens": 54952525.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 120.203125, "completions/mean_terminated_length": 120.203125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.7895217788607978, "grad_norm": 1.3383368253707886, "kl": 3.265625, "learning_rate": 3.3658326965115372e-06, "loss": 0.1088, "num_tokens": 54967306.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 115.8125, "completions/mean_terminated_length": 115.8125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.790740176667682, "grad_norm": 2.727271556854248, "kl": 5.4755859375, "learning_rate": 3.359464648872448e-06, "loss": 0.3028, "num_tokens": 54981422.0, "reward": 1.87890625, "reward_std": 0.2656588554382324, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 119.078125, "completions/mean_terminated_length": 119.078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.791958574474566, "grad_norm": 1.3771227598190308, "kl": 2.818359375, "learning_rate": 3.353101414612594e-06, "loss": 0.128, "num_tokens": 54995835.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.79317697228145, "grad_norm": 2.147578239440918, "kl": 3.92578125, "learning_rate": 3.3467429983443477e-06, "loss": 0.2673, "num_tokens": 55010275.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 119.96875, "completions/mean_terminated_length": 119.96875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.794395370088334, "grad_norm": 1.9058259725570679, "kl": 3.642578125, "learning_rate": 3.3403894046765984e-06, "loss": 0.2407, "num_tokens": 55025017.0, "reward": 1.8828125, "reward_std": 0.24147525429725647, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1423913538455963, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.795613767895218, "grad_norm": 1.5119701623916626, "kl": 2.7724609375, "learning_rate": 3.334040638214735e-06, "loss": 0.1626, "num_tokens": 55040869.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 127.484375, "completions/mean_terminated_length": 127.484375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.7968321657021016, "grad_norm": 1.8151075839996338, "kl": 1.966796875, "learning_rate": 3.32769670356065e-06, "loss": 0.0787, "num_tokens": 55056876.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 107.578125, "completions/mean_terminated_length": 107.578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.7980505635089856, "grad_norm": 1.351534366607666, "kl": 2.6328125, "learning_rate": 3.321357605312734e-06, "loss": 0.1175, "num_tokens": 55070257.0, "reward": 1.91015625, "reward_std": 0.2541164755821228, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 114.15625, "completions/mean_terminated_length": 114.15625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.7992689613158697, "grad_norm": 1.1767619848251343, "kl": 2.8671875, "learning_rate": 3.315023348065863e-06, "loss": 0.0793, "num_tokens": 55084259.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 125.296875, "completions/mean_terminated_length": 125.296875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 3.8004873591227537, "grad_norm": 0.8592846393585205, "kl": 1.826171875, "learning_rate": 3.308693936411421e-06, "loss": 0.0757, "num_tokens": 55099942.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 113.390625, "completions/mean_terminated_length": 113.390625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.8017057569296373, "grad_norm": 1.5557265281677246, "kl": 2.83984375, "learning_rate": 3.3023693749372597e-06, "loss": 0.0225, "num_tokens": 55113815.0, "reward": 1.85546875, "reward_std": 0.3624746799468994, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 127.609375, "completions/mean_terminated_length": 127.609375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.8029241547365213, "grad_norm": 1.6255065202713013, "kl": 2.421875, "learning_rate": 3.2960496682277308e-06, "loss": 0.1043, "num_tokens": 55129358.0, "reward": 1.94921875, "reward_std": 0.09439879655838013, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 120.234375, "completions/mean_terminated_length": 120.234375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.8041425525434054, "grad_norm": 1.9882680177688599, "kl": 4.189453125, "learning_rate": 3.2897348208636616e-06, "loss": 0.2542, "num_tokens": 55143773.0, "reward": 1.85546875, "reward_std": 0.3095911741256714, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 107.40625, "completions/mean_terminated_length": 107.40625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 3.8053609503502894, "grad_norm": 1.2806950807571411, "kl": 1.59375, "learning_rate": 3.2834248374223556e-06, "loss": 0.0637, "num_tokens": 55157239.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 119.265625, "completions/mean_terminated_length": 119.265625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.8065793481571735, "grad_norm": 1.1149553060531616, "kl": 2.515625, "learning_rate": 3.2771197224775954e-06, "loss": 0.1205, "num_tokens": 55171712.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 113.515625, "completions/mean_terminated_length": 113.515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.8077977459640575, "grad_norm": 1.1833397150039673, "kl": 2.0048828125, "learning_rate": 3.2708194805996252e-06, "loss": 0.0615, "num_tokens": 55185513.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 3.809016143770941, "grad_norm": 1.9026809930801392, "kl": 4.17578125, "learning_rate": 3.2645241163551678e-06, "loss": 0.201, "num_tokens": 55202577.0, "reward": 1.92578125, "reward_std": 0.12579017877578735, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 121.46875, "completions/mean_terminated_length": 121.46875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.810234541577825, "grad_norm": 1.2738858461380005, "kl": 2.080078125, "learning_rate": 3.2582336343074037e-06, "loss": 0.0212, "num_tokens": 55217647.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 119.65625, "completions/mean_terminated_length": 119.65625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.811452939384709, "grad_norm": 1.2214066982269287, "kl": 1.306640625, "learning_rate": 3.2519480390159806e-06, "loss": 0.0281, "num_tokens": 55232897.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 114.359375, "completions/mean_terminated_length": 114.359375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.812671337191593, "grad_norm": 3.4961860179901123, "kl": 3.30078125, "learning_rate": 3.245667335036995e-06, "loss": 0.1355, "num_tokens": 55247416.0, "reward": 1.90625, "reward_std": 0.2236309051513672, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 112.734375, "completions/mean_terminated_length": 112.734375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.813889734998477, "grad_norm": 1.740178108215332, "kl": 1.10546875, "learning_rate": 3.239391526923005e-06, "loss": -0.002, "num_tokens": 55261191.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 111.328125, "completions/mean_terminated_length": 111.328125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.815108132805361, "grad_norm": 0.6391153931617737, "kl": 1.0703125, "learning_rate": 3.233120619223021e-06, "loss": 0.038, "num_tokens": 55275020.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 126.640625, "completions/mean_terminated_length": 126.640625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.816326530612245, "grad_norm": 0.975274920463562, "kl": 1.8203125, "learning_rate": 3.2268546164824933e-06, "loss": 0.1112, "num_tokens": 55290501.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 119.546875, "completions/mean_terminated_length": 119.546875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.817544928419129, "grad_norm": 0.5131574273109436, "kl": 0.7578125, "learning_rate": 3.220593523243324e-06, "loss": 0.0304, "num_tokens": 55305376.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 135.578125, "completions/mean_terminated_length": 135.578125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.818763326226013, "grad_norm": 0.4705113172531128, "kl": 0.734375, "learning_rate": 3.214337344043854e-06, "loss": 0.0159, "num_tokens": 55323165.0, "reward": 1.96875, "reward_std": 0.0765465572476387, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 110.296875, "completions/mean_terminated_length": 110.296875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.8199817240328966, "grad_norm": 2.4588539600372314, "kl": 1.5166015625, "learning_rate": 3.208086083418863e-06, "loss": 0.0644, "num_tokens": 55337000.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 110.65625, "completions/mean_terminated_length": 110.65625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.8212001218397806, "grad_norm": 0.3556246757507324, "kl": 0.7255859375, "learning_rate": 3.20183974589957e-06, "loss": 0.0291, "num_tokens": 55351098.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 104.328125, "completions/mean_terminated_length": 104.328125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 3.8224185196466647, "grad_norm": 0.6283026933670044, "kl": 0.8994140625, "learning_rate": 3.1955983360136155e-06, "loss": 0.0329, "num_tokens": 55364231.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 100.203125, "completions/mean_terminated_length": 100.203125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.8236369174535487, "grad_norm": 0.9768409729003906, "kl": 1.1953125, "learning_rate": 3.1893618582850705e-06, "loss": 0.0371, "num_tokens": 55377300.0, "reward": 1.94921875, "reward_std": 0.0973096489906311, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 124.671875, "completions/mean_terminated_length": 124.671875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.8248553152604323, "grad_norm": 0.40392976999282837, "kl": 0.74609375, "learning_rate": 3.1831303172344386e-06, "loss": 0.0253, "num_tokens": 55392479.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 109.171875, "completions/mean_terminated_length": 109.171875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.8260737130673164, "grad_norm": 1.8139457702636719, "kl": 0.900390625, "learning_rate": 3.1769037173786376e-06, "loss": 0.0362, "num_tokens": 55406490.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 111.03125, "completions/mean_terminated_length": 111.03125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.8272921108742004, "grad_norm": 1.5143656730651855, "kl": 0.94140625, "learning_rate": 3.17068206323101e-06, "loss": 0.0004, "num_tokens": 55420932.0, "reward": 1.90234375, "reward_std": 0.2762135863304138, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 112.828125, "completions/mean_terminated_length": 112.828125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.8285105086810844, "grad_norm": 2.043498992919922, "kl": 1.275390625, "learning_rate": 3.1644653593013063e-06, "loss": 0.0964, "num_tokens": 55435121.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 129.078125, "completions/mean_terminated_length": 129.078125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.8297289064879685, "grad_norm": 1.9211702346801758, "kl": 1.76025390625, "learning_rate": 3.1582536100956973e-06, "loss": 0.1385, "num_tokens": 55451206.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 94.71875, "completions/mean_terminated_length": 94.71875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.8309473042948525, "grad_norm": 1.329574704170227, "kl": 2.46923828125, "learning_rate": 3.152046820116751e-06, "loss": 0.1487, "num_tokens": 55463580.0, "reward": 1.9375, "reward_std": 0.13862094283103943, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 102.5625, "completions/mean_terminated_length": 102.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.832165702101736, "grad_norm": 0.9520961046218872, "kl": 1.076171875, "learning_rate": 3.1458449938634494e-06, "loss": 0.0171, "num_tokens": 55477136.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 95.390625, "completions/mean_terminated_length": 95.390625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.83338409990862, "grad_norm": 0.6136399507522583, "kl": 0.64453125, "learning_rate": 3.1396481358311792e-06, "loss": 0.021, "num_tokens": 55489817.0, "reward": 1.99609375, "reward_std": 0.011048543266952038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 102.125, "completions/mean_terminated_length": 102.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.834602497715504, "grad_norm": 0.7507253289222717, "kl": 1.1083984375, "learning_rate": 3.1334562505117126e-06, "loss": 0.0354, "num_tokens": 55503017.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 96.46875, "completions/mean_terminated_length": 96.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.835820895522388, "grad_norm": 0.7277844548225403, "kl": 0.58349609375, "learning_rate": 3.127269342393231e-06, "loss": 0.0128, "num_tokens": 55515799.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 91.5625, "completions/mean_terminated_length": 91.5625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.837039293329272, "grad_norm": 0.17819543182849884, "kl": 0.578125, "learning_rate": 3.1210874159603044e-06, "loss": 0.0231, "num_tokens": 55527827.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 103.1875, "completions/mean_terminated_length": 103.1875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.838257691136156, "grad_norm": 1.128683090209961, "kl": 1.728515625, "learning_rate": 3.1149104756938843e-06, "loss": 0.0829, "num_tokens": 55541319.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 105.359375, "completions/mean_terminated_length": 105.359375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.83947608894304, "grad_norm": 1.8421443700790405, "kl": 2.150390625, "learning_rate": 3.1087385260713166e-06, "loss": 0.0799, "num_tokens": 55555046.0, "reward": 1.85546875, "reward_std": 0.32572782039642334, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10789459943771362, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 108.640625, "completions/mean_terminated_length": 108.640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.840694486749924, "grad_norm": 1.813378930091858, "kl": 2.7265625, "learning_rate": 3.1025715715663284e-06, "loss": 0.2222, "num_tokens": 55568847.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 94.484375, "completions/mean_terminated_length": 94.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.841912884556808, "grad_norm": 0.672375500202179, "kl": 0.724609375, "learning_rate": 3.096409616649023e-06, "loss": 0.0077, "num_tokens": 55581670.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 103.9375, "completions/mean_terminated_length": 103.9375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.8431312823636916, "grad_norm": 1.3586934804916382, "kl": 3.033203125, "learning_rate": 3.0902526657858835e-06, "loss": 0.1753, "num_tokens": 55594946.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 101.40625, "completions/mean_terminated_length": 101.40625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.8443496801705757, "grad_norm": 0.542064905166626, "kl": 0.64013671875, "learning_rate": 3.0841007234397655e-06, "loss": 0.0256, "num_tokens": 55608372.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 90.09375, "completions/mean_terminated_length": 90.09375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.8455680779774597, "grad_norm": 0.6772673726081848, "kl": 0.7841796875, "learning_rate": 3.077953794069891e-06, "loss": 0.0003, "num_tokens": 55620498.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 92.1875, "completions/mean_terminated_length": 92.1875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.8467864757843437, "grad_norm": 1.1526594161987305, "kl": 2.1953125, "learning_rate": 3.071811882131844e-06, "loss": 0.0904, "num_tokens": 55633174.0, "reward": 1.890625, "reward_std": 0.23162393271923065, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 112.78125, "completions/mean_terminated_length": 112.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 3.8480048735912273, "grad_norm": 0.7409985065460205, "kl": 0.66943359375, "learning_rate": 3.065674992077584e-06, "loss": 0.0291, "num_tokens": 55648040.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 98.875, "completions/mean_terminated_length": 98.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.8492232713981114, "grad_norm": 1.3688238859176636, "kl": 1.953125, "learning_rate": 3.0595431283554212e-06, "loss": 0.0868, "num_tokens": 55661632.0, "reward": 1.87890625, "reward_std": 0.24209868907928467, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 96.21875, "completions/mean_terminated_length": 96.21875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.8504416692049954, "grad_norm": 1.4741400480270386, "kl": 1.4384765625, "learning_rate": 3.0534162954100264e-06, "loss": 0.0099, "num_tokens": 55674550.0, "reward": 1.8984375, "reward_std": 0.22621294856071472, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 84.515625, "completions/mean_terminated_length": 84.515625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.8516600670118795, "grad_norm": 1.572265625, "kl": 3.013671875, "learning_rate": 3.0472944976824225e-06, "loss": 0.1549, "num_tokens": 55686407.0, "reward": 1.89453125, "reward_std": 0.29831069707870483, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 101.90625, "completions/mean_terminated_length": 101.90625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.8528784648187635, "grad_norm": 1.0719329118728638, "kl": 1.93408203125, "learning_rate": 3.041177739609986e-06, "loss": 0.087, "num_tokens": 55700361.0, "reward": 1.91015625, "reward_std": 0.1972038745880127, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.19697457551956177, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 90.390625, "completions/mean_terminated_length": 90.390625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.8540968626256475, "grad_norm": 1.4320241212844849, "kl": 1.8525390625, "learning_rate": 3.035066025626434e-06, "loss": 0.0379, "num_tokens": 55712754.0, "reward": 1.8671875, "reward_std": 0.25807058811187744, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 123.59375, "completions/mean_terminated_length": 123.59375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.855315260432531, "grad_norm": 1.153490424156189, "kl": 2.90283203125, "learning_rate": 3.0289593601618274e-06, "loss": 0.2244, "num_tokens": 55728416.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 91.09375, "completions/mean_terminated_length": 91.09375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.856533658239415, "grad_norm": 0.21335946023464203, "kl": 0.4619140625, "learning_rate": 3.0228577476425736e-06, "loss": 0.0185, "num_tokens": 55740934.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 88.390625, "completions/mean_terminated_length": 88.390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.8577520560462992, "grad_norm": 0.24805952608585358, "kl": 0.50146484375, "learning_rate": 3.0167611924914154e-06, "loss": 0.0201, "num_tokens": 55753167.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 84.984375, "completions/mean_terminated_length": 84.984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.858970453853183, "grad_norm": 1.1042753458023071, "kl": 1.306640625, "learning_rate": 3.0106696991274278e-06, "loss": -0.011, "num_tokens": 55765590.0, "reward": 1.875, "reward_std": 0.2177756428718567, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.24397502839565277, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 79.28125, "completions/mean_terminated_length": 79.28125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.860188851660067, "grad_norm": 0.7605605721473694, "kl": 0.84765625, "learning_rate": 3.004583271966023e-06, "loss": 0.0297, "num_tokens": 55777256.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 114.453125, "completions/mean_terminated_length": 114.453125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.861407249466951, "grad_norm": 3.624054193496704, "kl": 4.20263671875, "learning_rate": 2.9985019154189277e-06, "loss": 0.2777, "num_tokens": 55792269.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 83.546875, "completions/mean_terminated_length": 83.546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.862625647273835, "grad_norm": 0.7403936982154846, "kl": 0.8876953125, "learning_rate": 2.9924256338942072e-06, "loss": 0.0084, "num_tokens": 55804520.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 80.8125, "completions/mean_terminated_length": 80.8125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.863844045080719, "grad_norm": 0.3373754322528839, "kl": 0.4794921875, "learning_rate": 2.986354431796239e-06, "loss": 0.0192, "num_tokens": 55816140.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 76.609375, "completions/mean_terminated_length": 76.609375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.865062442887603, "grad_norm": 0.5843992233276367, "kl": 0.99365234375, "learning_rate": 2.980288313525728e-06, "loss": 0.0232, "num_tokens": 55827891.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.8662808406944866, "grad_norm": 0.7530199885368347, "kl": 1.3310546875, "learning_rate": 2.9742272834796813e-06, "loss": 0.0634, "num_tokens": 55841771.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 97.921875, "completions/mean_terminated_length": 97.921875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.8674992385013707, "grad_norm": 0.40079474449157715, "kl": 0.46337890625, "learning_rate": 2.9681713460514283e-06, "loss": -0.0017, "num_tokens": 55855486.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.8687176363082547, "grad_norm": 2.7779064178466797, "kl": 1.419921875, "learning_rate": 2.9621205056306056e-06, "loss": 0.071, "num_tokens": 55872822.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 90.71875, "completions/mean_terminated_length": 90.71875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.8699360341151388, "grad_norm": 1.7091089487075806, "kl": 3.0078125, "learning_rate": 2.9560747666031477e-06, "loss": 0.2016, "num_tokens": 55885548.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 83.734375, "completions/mean_terminated_length": 83.734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.8711544319220224, "grad_norm": 9.587554931640625, "kl": 0.8515625, "learning_rate": 2.9500341333513003e-06, "loss": 0.0066, "num_tokens": 55897555.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 107.984375, "completions/mean_terminated_length": 107.984375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.8723728297289064, "grad_norm": 1.8587672710418701, "kl": 2.9775390625, "learning_rate": 2.9439986102536046e-06, "loss": 0.0964, "num_tokens": 55912306.0, "reward": 1.85546875, "reward_std": 0.34774690866470337, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.23671942949295044, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 101.765625, "completions/mean_terminated_length": 101.765625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.8735912275357904, "grad_norm": 1.9449645280838013, "kl": 5.67236328125, "learning_rate": 2.9379682016848975e-06, "loss": 0.4342, "num_tokens": 55925691.0, "reward": 1.9296875, "reward_std": 0.19887377321720123, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 79.625, "completions/mean_terminated_length": 79.625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.8748096253426745, "grad_norm": 0.6090725660324097, "kl": 0.845703125, "learning_rate": 2.9319429120163114e-06, "loss": -0.0332, "num_tokens": 55937035.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 83.5, "completions/mean_terminated_length": 83.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.8760280231495585, "grad_norm": 2.4220852851867676, "kl": 3.533203125, "learning_rate": 2.9259227456152615e-06, "loss": 0.109, "num_tokens": 55949075.0, "reward": 1.84375, "reward_std": 0.3779512941837311, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.22271771728992462, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 91.0, "completions/mean_terminated_length": 91.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.877246420956442, "grad_norm": 1.5989118814468384, "kl": 1.416015625, "learning_rate": 2.9199077068454583e-06, "loss": 0.0565, "num_tokens": 55961603.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 124.84375, "completions/mean_terminated_length": 124.84375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.878464818763326, "grad_norm": 2.514558792114258, "kl": 5.970703125, "learning_rate": 2.913897800066887e-06, "loss": 0.3445, "num_tokens": 55978321.0, "reward": 1.82421875, "reward_std": 0.4083726406097412, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21931616961956024, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.87968321657021, "grad_norm": 1.6658309698104858, "kl": 2.08544921875, "learning_rate": 2.907893029635818e-06, "loss": 0.0449, "num_tokens": 55993137.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 116.515625, "completions/mean_terminated_length": 102.11111450195312, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.8809016143770942, "grad_norm": 4.276834487915039, "kl": 9.240234375, "learning_rate": 2.901893399904797e-06, "loss": 0.6393, "num_tokens": 56007498.0, "reward": 1.88671875, "reward_std": 0.2325522005558014, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 110.171875, "completions/mean_terminated_length": 110.171875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.882120012183978, "grad_norm": 1.3574150800704956, "kl": 2.19775390625, "learning_rate": 2.895898915222647e-06, "loss": 0.0753, "num_tokens": 56022117.0, "reward": 1.89453125, "reward_std": 0.2587311267852783, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 83.515625, "completions/mean_terminated_length": 83.515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 3.883338409990862, "grad_norm": 1.5122638940811157, "kl": 3.72216796875, "learning_rate": 2.889909579934458e-06, "loss": 0.1739, "num_tokens": 56033662.0, "reward": 1.85546875, "reward_std": 0.3624746799468994, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18992316722869873, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 107.71875, "completions/mean_terminated_length": 107.71875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.884556807797746, "grad_norm": 1.4786897897720337, "kl": 1.51513671875, "learning_rate": 2.883925398381585e-06, "loss": 0.0655, "num_tokens": 56048004.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 103.515625, "completions/mean_terminated_length": 103.515625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.88577520560463, "grad_norm": 0.8866425156593323, "kl": 1.20361328125, "learning_rate": 2.8779463749016525e-06, "loss": 0.0277, "num_tokens": 56062085.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 107.015625, "completions/mean_terminated_length": 107.015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.886993603411514, "grad_norm": 1.7793182134628296, "kl": 7.65625, "learning_rate": 2.8719725138285472e-06, "loss": 0.5305, "num_tokens": 56075542.0, "reward": 1.83984375, "reward_std": 0.40666887164115906, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.23249077796936035, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 91.703125, "completions/mean_terminated_length": 91.703125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.888212001218398, "grad_norm": 1.2642605304718018, "kl": 2.24755859375, "learning_rate": 2.8660038194924045e-06, "loss": 0.0444, "num_tokens": 56088179.0, "reward": 1.86328125, "reward_std": 0.32564982771873474, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2215663492679596, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 101.015625, "completions/mean_terminated_length": 101.015625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.8894303990252816, "grad_norm": 0.9941763281822205, "kl": 1.7373046875, "learning_rate": 2.860040296219624e-06, "loss": 0.0974, "num_tokens": 56101500.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 96.8125, "completions/mean_terminated_length": 96.8125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.8906487968321657, "grad_norm": 0.44122210144996643, "kl": 0.80126953125, "learning_rate": 2.854081948332854e-06, "loss": 0.0081, "num_tokens": 56114632.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 115.640625, "completions/mean_terminated_length": 115.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.8918671946390497, "grad_norm": 1.3136874437332153, "kl": 2.283203125, "learning_rate": 2.8481287801509947e-06, "loss": 0.0848, "num_tokens": 56129289.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 113.484375, "completions/mean_terminated_length": 113.484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.8930855924459333, "grad_norm": 1.3921455144882202, "kl": 2.04150390625, "learning_rate": 2.8421807959891824e-06, "loss": 0.0581, "num_tokens": 56144032.0, "reward": 1.89453125, "reward_std": 0.2519892454147339, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 97.625, "completions/mean_terminated_length": 97.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.8943039902528174, "grad_norm": 1.2641814947128296, "kl": 0.72216796875, "learning_rate": 2.8362380001588054e-06, "loss": 0.0243, "num_tokens": 56156784.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 99.375, "completions/mean_terminated_length": 99.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.8955223880597014, "grad_norm": 1.2268046140670776, "kl": 2.5283203125, "learning_rate": 2.830300396967487e-06, "loss": 0.1419, "num_tokens": 56169776.0, "reward": 1.90625, "reward_std": 0.21593277156352997, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 115.59375, "completions/mean_terminated_length": 115.59375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.8967407858665855, "grad_norm": 0.9424760341644287, "kl": 1.86962890625, "learning_rate": 2.824367990719088e-06, "loss": 0.0642, "num_tokens": 56184294.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.8979591836734695, "grad_norm": 1.9279075860977173, "kl": 2.37646484375, "learning_rate": 2.818440785713704e-06, "loss": 0.1183, "num_tokens": 56198966.0, "reward": 1.91015625, "reward_std": 0.20206692814826965, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 123.828125, "completions/mean_terminated_length": 123.828125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.8991775814803535, "grad_norm": 0.895325243473053, "kl": 1.57421875, "learning_rate": 2.8125187862476567e-06, "loss": 0.0742, "num_tokens": 56214651.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 106.3125, "completions/mean_terminated_length": 106.3125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.900395979287237, "grad_norm": 0.13045474886894226, "kl": 0.52197265625, "learning_rate": 2.8066019966134907e-06, "loss": 0.0209, "num_tokens": 56228599.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 97.71875, "completions/mean_terminated_length": 97.71875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.901614377094121, "grad_norm": 0.9341543912887573, "kl": 1.4462890625, "learning_rate": 2.800690421099984e-06, "loss": 0.0079, "num_tokens": 56241709.0, "reward": 1.8828125, "reward_std": 0.2851349115371704, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 105.90625, "completions/mean_terminated_length": 105.90625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.902832774901005, "grad_norm": 0.823971152305603, "kl": 0.81982421875, "learning_rate": 2.7947840639921308e-06, "loss": 0.0331, "num_tokens": 56255759.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 107.765625, "completions/mean_terminated_length": 107.765625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.9040511727078893, "grad_norm": 1.1463373899459839, "kl": 0.779296875, "learning_rate": 2.7888829295711415e-06, "loss": 0.0488, "num_tokens": 56269184.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 111.828125, "completions/mean_terminated_length": 111.828125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.905269570514773, "grad_norm": 0.6823846101760864, "kl": 0.7587890625, "learning_rate": 2.7829870221144426e-06, "loss": 0.0357, "num_tokens": 56283429.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 111.390625, "completions/mean_terminated_length": 111.390625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.906487968321657, "grad_norm": 0.16144955158233643, "kl": 0.4912109375, "learning_rate": 2.7770963458956758e-06, "loss": 0.0197, "num_tokens": 56297966.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 111.203125, "completions/mean_terminated_length": 111.203125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.907706366128541, "grad_norm": 0.6558759808540344, "kl": 1.962890625, "learning_rate": 2.77121090518468e-06, "loss": 0.1194, "num_tokens": 56312099.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 102.90625, "completions/mean_terminated_length": 102.90625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.908924763935425, "grad_norm": 0.743125319480896, "kl": 1.60546875, "learning_rate": 2.765330704247505e-06, "loss": 0.0733, "num_tokens": 56325533.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 106.609375, "completions/mean_terminated_length": 106.609375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.910143161742309, "grad_norm": 1.4552814960479736, "kl": 1.7705078125, "learning_rate": 2.759455747346407e-06, "loss": 0.0821, "num_tokens": 56338972.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 121.359375, "completions/mean_terminated_length": 121.359375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.911361559549193, "grad_norm": 1.779808521270752, "kl": 2.66845703125, "learning_rate": 2.753586038739834e-06, "loss": 0.2359, "num_tokens": 56354035.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 103.484375, "completions/mean_terminated_length": 103.484375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.9125799573560767, "grad_norm": 0.4507041573524475, "kl": 0.59326171875, "learning_rate": 2.7477215826824355e-06, "loss": -0.0057, "num_tokens": 56367314.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.9137983551629607, "grad_norm": 1.8424073457717896, "kl": 3.59423828125, "learning_rate": 2.74186238342505e-06, "loss": 0.1773, "num_tokens": 56383578.0, "reward": 1.90625, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 120.0625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.9150167529698447, "grad_norm": 0.7430741190910339, "kl": 0.82080078125, "learning_rate": 2.7360084452147108e-06, "loss": 0.0287, "num_tokens": 56398534.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 113.640625, "completions/mean_terminated_length": 113.640625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.9162351507767283, "grad_norm": 2.259256362915039, "kl": 1.3984375, "learning_rate": 2.730159772294627e-06, "loss": 0.0866, "num_tokens": 56413159.0, "reward": 1.91015625, "reward_std": 0.21258234977722168, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 115.515625, "completions/mean_terminated_length": 115.515625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.9174535485836124, "grad_norm": 1.0133934020996094, "kl": 0.7451171875, "learning_rate": 2.724316368904201e-06, "loss": 0.062, "num_tokens": 56427808.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 107.40625, "completions/mean_terminated_length": 107.40625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.9186719463904964, "grad_norm": 1.2114897966384888, "kl": 2.466796875, "learning_rate": 2.718478239279014e-06, "loss": 0.0875, "num_tokens": 56441602.0, "reward": 1.890625, "reward_std": 0.30935919284820557, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 98.3125, "completions/mean_terminated_length": 98.3125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.9198903441973805, "grad_norm": 1.1702775955200195, "kl": 2.19091796875, "learning_rate": 2.7126453876508195e-06, "loss": 0.1661, "num_tokens": 56454278.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 103.9375, "completions/mean_terminated_length": 103.9375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.9211087420042645, "grad_norm": 0.6543660163879395, "kl": 0.755859375, "learning_rate": 2.7068178182475514e-06, "loss": 0.041, "num_tokens": 56467666.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 98.484375, "completions/mean_terminated_length": 98.484375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.9223271398111486, "grad_norm": 1.1191301345825195, "kl": 2.48583984375, "learning_rate": 2.7009955352933127e-06, "loss": 0.1491, "num_tokens": 56480433.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.923545537618032, "grad_norm": 1.246230959892273, "kl": 1.7744140625, "learning_rate": 2.695178543008371e-06, "loss": 0.0981, "num_tokens": 56494497.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 96.296875, "completions/mean_terminated_length": 96.296875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 3.924763935424916, "grad_norm": 0.6812640428543091, "kl": 0.81884765625, "learning_rate": 2.6893668456091627e-06, "loss": 0.0093, "num_tokens": 56507716.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 112.03125, "completions/mean_terminated_length": 112.03125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.9259823332318002, "grad_norm": 1.4534835815429688, "kl": 3.23828125, "learning_rate": 2.6835604473082868e-06, "loss": 0.2016, "num_tokens": 56522486.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 100.84375, "completions/mean_terminated_length": 100.84375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.9272007310386843, "grad_norm": 0.5113069415092468, "kl": 0.77490234375, "learning_rate": 2.6777593523144986e-06, "loss": 0.0157, "num_tokens": 56535884.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 114.34375, "completions/mean_terminated_length": 114.34375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 3.928419128845568, "grad_norm": 0.3507748544216156, "kl": 0.5166015625, "learning_rate": 2.6719635648327125e-06, "loss": 0.0207, "num_tokens": 56550594.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 111.984375, "completions/mean_terminated_length": 111.984375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.929637526652452, "grad_norm": 0.3985324203968048, "kl": 0.5791015625, "learning_rate": 2.6661730890639947e-06, "loss": -0.0019, "num_tokens": 56565121.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 112.328125, "completions/mean_terminated_length": 112.328125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.930855924459336, "grad_norm": 0.21484175324440002, "kl": 0.4580078125, "learning_rate": 2.660387929205559e-06, "loss": 0.0184, "num_tokens": 56579478.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 106.59375, "completions/mean_terminated_length": 106.59375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.93207432226622, "grad_norm": 1.5929789543151855, "kl": 2.57080078125, "learning_rate": 2.6546080894507642e-06, "loss": 0.1721, "num_tokens": 56592916.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 105.640625, "completions/mean_terminated_length": 105.640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.933292720073104, "grad_norm": 2.0591304302215576, "kl": 3.9580078125, "learning_rate": 2.648833573989118e-06, "loss": 0.2366, "num_tokens": 56606589.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 95.859375, "completions/mean_terminated_length": 95.859375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.934511117879988, "grad_norm": 0.5568116307258606, "kl": 0.7001953125, "learning_rate": 2.643064387006268e-06, "loss": 0.0369, "num_tokens": 56619484.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 96.0, "completions/mean_terminated_length": 96.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.9357295156868717, "grad_norm": 0.6049745678901672, "kl": 0.98388671875, "learning_rate": 2.6373005326839973e-06, "loss": 0.0025, "num_tokens": 56632812.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 116.9375, "completions/mean_terminated_length": 116.9375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.9369479134937557, "grad_norm": 0.8974380493164062, "kl": 1.32861328125, "learning_rate": 2.631542015200228e-06, "loss": 0.0865, "num_tokens": 56647472.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 98.09375, "completions/mean_terminated_length": 98.09375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 3.9381663113006398, "grad_norm": 1.398660659790039, "kl": 2.35986328125, "learning_rate": 2.6257888387290043e-06, "loss": 0.1324, "num_tokens": 56660726.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 110.40625, "completions/mean_terminated_length": 110.40625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 3.9393847091075234, "grad_norm": 0.4325087070465088, "kl": 0.9609375, "learning_rate": 2.620041007440508e-06, "loss": 0.0048, "num_tokens": 56675064.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 90.796875, "completions/mean_terminated_length": 90.796875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.9406031069144074, "grad_norm": 0.14959405362606049, "kl": 0.42333984375, "learning_rate": 2.614298525501047e-06, "loss": 0.017, "num_tokens": 56687403.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.9418215047212914, "grad_norm": 1.4373210668563843, "kl": 2.279296875, "learning_rate": 2.6085613970730404e-06, "loss": 0.1394, "num_tokens": 56702099.0, "reward": 1.90234375, "reward_std": 0.24652892351150513, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13264097273349762, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 112.484375, "completions/mean_terminated_length": 112.484375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.9430399025281755, "grad_norm": 1.0176525115966797, "kl": 1.94482421875, "learning_rate": 2.602829626315041e-06, "loss": 0.1079, "num_tokens": 56717066.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 97.65625, "completions/mean_terminated_length": 97.65625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.9442583003350595, "grad_norm": 0.22130563855171204, "kl": 0.42431640625, "learning_rate": 2.597103217381709e-06, "loss": 0.017, "num_tokens": 56730348.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 109.078125, "completions/mean_terminated_length": 109.078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.9454766981419436, "grad_norm": 1.37908935546875, "kl": 3.51123046875, "learning_rate": 2.591382174423824e-06, "loss": 0.183, "num_tokens": 56744657.0, "reward": 1.9140625, "reward_std": 0.18201877176761627, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 93.734375, "completions/mean_terminated_length": 93.734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.946695095948827, "grad_norm": 0.5686092376708984, "kl": 0.6943359375, "learning_rate": 2.5856665015882686e-06, "loss": 0.0234, "num_tokens": 56757296.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 100.9375, "completions/mean_terminated_length": 100.9375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.947913493755711, "grad_norm": 0.17361058294773102, "kl": 0.40966796875, "learning_rate": 2.5799562030180404e-06, "loss": 0.0164, "num_tokens": 56771188.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 85.828125, "completions/mean_terminated_length": 85.828125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.9491318915625953, "grad_norm": 1.0611646175384521, "kl": 1.39794921875, "learning_rate": 2.574251282852236e-06, "loss": 0.0356, "num_tokens": 56783017.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 120.171875, "completions/mean_terminated_length": 120.171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.9503502893694793, "grad_norm": 0.6300775408744812, "kl": 1.03955078125, "learning_rate": 2.5685517452260566e-06, "loss": 0.0616, "num_tokens": 56799116.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 102.046875, "completions/mean_terminated_length": 102.046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.951568687176363, "grad_norm": 0.8854254484176636, "kl": 1.5009765625, "learning_rate": 2.5628575942708047e-06, "loss": 0.0824, "num_tokens": 56812495.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 91.609375, "completions/mean_terminated_length": 91.609375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 3.952787084983247, "grad_norm": 1.5593849420547485, "kl": 3.4541015625, "learning_rate": 2.557168834113869e-06, "loss": 0.1987, "num_tokens": 56825198.0, "reward": 1.89453125, "reward_std": 0.2519892454147339, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 95.6875, "completions/mean_terminated_length": 95.6875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.954005482790131, "grad_norm": 0.7018871307373047, "kl": 0.8115234375, "learning_rate": 2.5514854688787406e-06, "loss": 0.0068, "num_tokens": 56837858.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 94.5625, "completions/mean_terminated_length": 94.5625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 3.955223880597015, "grad_norm": 0.5881611704826355, "kl": 0.50537109375, "learning_rate": 2.5458075026849915e-06, "loss": 0.0135, "num_tokens": 56850790.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 107.59375, "completions/mean_terminated_length": 107.59375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.956442278403899, "grad_norm": 0.43831580877304077, "kl": 0.5595703125, "learning_rate": 2.5401349396482867e-06, "loss": 0.0077, "num_tokens": 56865292.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 94.28125, "completions/mean_terminated_length": 94.28125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.9576606762107827, "grad_norm": 1.2065566778182983, "kl": 2.1767578125, "learning_rate": 2.534467783880373e-06, "loss": 0.16, "num_tokens": 56877966.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 104.140625, "completions/mean_terminated_length": 104.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.9588790740176667, "grad_norm": 0.282025545835495, "kl": 0.482421875, "learning_rate": 2.528806039489078e-06, "loss": 0.0193, "num_tokens": 56891607.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 87.828125, "completions/mean_terminated_length": 87.828125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.9600974718245507, "grad_norm": 1.3179465532302856, "kl": 2.22509765625, "learning_rate": 2.5231497105783077e-06, "loss": 0.1111, "num_tokens": 56903796.0, "reward": 1.8984375, "reward_std": 0.2352125644683838, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 94.125, "completions/mean_terminated_length": 94.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.961315869631435, "grad_norm": 1.298208475112915, "kl": 1.85595703125, "learning_rate": 2.517498801248035e-06, "loss": 0.0644, "num_tokens": 56916260.0, "reward": 1.9296875, "reward_std": 0.19887377321720123, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 107.296875, "completions/mean_terminated_length": 107.296875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.9625342674383184, "grad_norm": 0.37393641471862793, "kl": 0.43701171875, "learning_rate": 2.511853315594318e-06, "loss": 0.0174, "num_tokens": 56930639.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 107.390625, "completions/mean_terminated_length": 107.390625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 3.9637526652452024, "grad_norm": 3.296877145767212, "kl": 5.1884765625, "learning_rate": 2.50621325770927e-06, "loss": 0.2542, "num_tokens": 56944168.0, "reward": 1.890625, "reward_std": 0.18645437061786652, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.19352105259895325, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 100.078125, "completions/mean_terminated_length": 100.078125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.9649710630520865, "grad_norm": 1.1620314121246338, "kl": 1.8232421875, "learning_rate": 2.5005786316810775e-06, "loss": 0.017, "num_tokens": 56957341.0, "reward": 1.8984375, "reward_std": 0.28726211190223694, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 106.984375, "completions/mean_terminated_length": 106.984375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.9661894608589705, "grad_norm": 0.9804143309593201, "kl": 2.29541015625, "learning_rate": 2.49494944159399e-06, "loss": 0.1179, "num_tokens": 56970748.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 91.5625, "completions/mean_terminated_length": 91.5625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.9674078586658545, "grad_norm": 0.8014360666275024, "kl": 1.10888671875, "learning_rate": 2.489325691528315e-06, "loss": 0.0141, "num_tokens": 56983360.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 97.109375, "completions/mean_terminated_length": 97.109375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.9686262564727386, "grad_norm": 0.3480142652988434, "kl": 0.45703125, "learning_rate": 2.4837073855604186e-06, "loss": 0.0183, "num_tokens": 56996135.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 94.515625, "completions/mean_terminated_length": 94.515625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.969844654279622, "grad_norm": 1.402114987373352, "kl": 0.98681640625, "learning_rate": 2.4780945277627144e-06, "loss": 0.0715, "num_tokens": 57008608.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 93.734375, "completions/mean_terminated_length": 93.734375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.9710630520865062, "grad_norm": 0.8526754379272461, "kl": 0.5859375, "learning_rate": 2.472487122203673e-06, "loss": 0.043, "num_tokens": 57020735.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 116.234375, "completions/mean_terminated_length": 116.234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.9722814498933903, "grad_norm": 1.3755264282226562, "kl": 3.29150390625, "learning_rate": 2.466885172947816e-06, "loss": 0.2178, "num_tokens": 57035918.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.973499847700274, "grad_norm": 0.9396213889122009, "kl": 0.6630859375, "learning_rate": 2.4612886840556993e-06, "loss": 0.0291, "num_tokens": 57049942.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 109.21875, "completions/mean_terminated_length": 109.21875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 3.974718245507158, "grad_norm": 0.21090520918369293, "kl": 0.380859375, "learning_rate": 2.455697659583929e-06, "loss": 0.0152, "num_tokens": 57064228.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 112.1875, "completions/mean_terminated_length": 112.1875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 3.975936643314042, "grad_norm": 1.3635108470916748, "kl": 3.9140625, "learning_rate": 2.4501121035851494e-06, "loss": 0.14, "num_tokens": 57078728.0, "reward": 1.85546875, "reward_std": 0.40879613161087036, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.244957834482193, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 108.046875, "completions/mean_terminated_length": 108.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 3.977155041120926, "grad_norm": 0.8305751085281372, "kl": 1.96435546875, "learning_rate": 2.4445320201080426e-06, "loss": 0.1083, "num_tokens": 57092771.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 92.765625, "completions/mean_terminated_length": 92.765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 3.97837343892781, "grad_norm": 1.147260308265686, "kl": 1.23828125, "learning_rate": 2.4389574131973158e-06, "loss": -0.0005, "num_tokens": 57105204.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 104.21875, "completions/mean_terminated_length": 104.21875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.979591836734694, "grad_norm": 0.9766493439674377, "kl": 1.81396484375, "learning_rate": 2.433388286893714e-06, "loss": 0.022, "num_tokens": 57118962.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 3.9808102345415777, "grad_norm": 0.2230781614780426, "kl": 0.37109375, "learning_rate": 2.427824645234009e-06, "loss": 0.0148, "num_tokens": 57133458.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 113.296875, "completions/mean_terminated_length": 113.296875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 3.9820286323484617, "grad_norm": 0.20166830718517303, "kl": 0.42431640625, "learning_rate": 2.422266492250994e-06, "loss": 0.017, "num_tokens": 57148349.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 104.84375, "completions/mean_terminated_length": 104.84375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.9832470301553458, "grad_norm": 1.2056472301483154, "kl": 1.43798828125, "learning_rate": 2.4167138319734907e-06, "loss": 0.0313, "num_tokens": 57162147.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 118.546875, "completions/mean_terminated_length": 118.546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 3.98446542796223, "grad_norm": 1.7236088514328003, "kl": 5.92578125, "learning_rate": 2.41116666842633e-06, "loss": 0.4296, "num_tokens": 57176934.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 100.296875, "completions/mean_terminated_length": 100.296875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 3.9856838257691134, "grad_norm": 0.9833700656890869, "kl": 0.794921875, "learning_rate": 2.405625005630359e-06, "loss": 0.0139, "num_tokens": 57190513.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 95.4375, "completions/mean_terminated_length": 95.4375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.9869022235759974, "grad_norm": 1.0875626802444458, "kl": 2.23291015625, "learning_rate": 2.4000888476024454e-06, "loss": 0.1415, "num_tokens": 57203205.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 107.484375, "completions/mean_terminated_length": 107.484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 3.9881206213828815, "grad_norm": 0.8320860266685486, "kl": 1.2060546875, "learning_rate": 2.394558198355462e-06, "loss": -0.0164, "num_tokens": 57216940.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 112.09375, "completions/mean_terminated_length": 112.09375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 3.9893390191897655, "grad_norm": 0.1739790141582489, "kl": 0.44775390625, "learning_rate": 2.3890330618982893e-06, "loss": 0.0179, "num_tokens": 57231338.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 119.015625, "completions/mean_terminated_length": 119.015625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 3.9905574169966496, "grad_norm": 1.5960522890090942, "kl": 4.5244140625, "learning_rate": 2.383513442235812e-06, "loss": 0.261, "num_tokens": 57246483.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 127.140625, "completions/mean_terminated_length": 127.140625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.9917758148035336, "grad_norm": 2.2050435543060303, "kl": 6.5302734375, "learning_rate": 2.3779993433689186e-06, "loss": 0.4055, "num_tokens": 57262380.0, "reward": 1.875, "reward_std": 0.2177756428718567, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.24397502839565277, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 108.40625, "completions/mean_terminated_length": 108.40625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 3.992994212610417, "grad_norm": 1.0283383131027222, "kl": 2.435546875, "learning_rate": 2.3724907692944875e-06, "loss": 0.1177, "num_tokens": 57276790.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.9942126104173012, "grad_norm": 1.5576951503753662, "kl": 3.0546875, "learning_rate": 2.366987724005404e-06, "loss": 0.0974, "num_tokens": 57290166.0, "reward": 1.87890625, "reward_std": 0.3425048589706421, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 104.609375, "completions/mean_terminated_length": 104.609375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.9954310082241853, "grad_norm": 1.0349440574645996, "kl": 0.98388671875, "learning_rate": 2.3614902114905324e-06, "loss": 0.0131, "num_tokens": 57303861.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 3.996649406031069, "grad_norm": 0.8320758938789368, "kl": 1.18359375, "learning_rate": 2.355998235734739e-06, "loss": 0.0554, "num_tokens": 57319673.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 101.9375, "completions/mean_terminated_length": 101.9375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.997867803837953, "grad_norm": 2.0136687755584717, "kl": 1.724609375, "learning_rate": 2.350511800718871e-06, "loss": 0.0004, "num_tokens": 57332813.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 91.40541076660156, "completions/mean_terminated_length": 91.40541076660156, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 3.999086201644837, "grad_norm": 0.7622662782669067, "kl": 0.87548828125, "learning_rate": 2.345030910419762e-06, "loss": 0.0066, "num_tokens": 57344867.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 115.359375, "completions/mean_terminated_length": 115.359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.001218397806884, "grad_norm": 0.8585883975028992, "kl": 0.91943359375, "learning_rate": 2.339555568810221e-06, "loss": -0.0114, "num_tokens": 57359834.0, "reward": 1.91796875, "reward_std": 0.17996984720230103, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.16587424278259277, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 109.78125, "completions/mean_terminated_length": 109.78125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.002436795613768, "grad_norm": 1.0673725605010986, "kl": 1.9755859375, "learning_rate": 2.334085779859041e-06, "loss": 0.0844, "num_tokens": 57374060.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 124.109375, "completions/mean_terminated_length": 124.109375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.003655193420652, "grad_norm": 0.1446772664785385, "kl": 0.384765625, "learning_rate": 2.328621547530987e-06, "loss": 0.0154, "num_tokens": 57389923.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 101.15625, "completions/mean_terminated_length": 101.15625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.004873591227536, "grad_norm": 0.8485239744186401, "kl": 0.9619140625, "learning_rate": 2.3231628757867976e-06, "loss": 0.0368, "num_tokens": 57403285.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 98.75, "completions/mean_terminated_length": 98.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.006091989034419, "grad_norm": 0.2586768865585327, "kl": 0.47900390625, "learning_rate": 2.3177097685831853e-06, "loss": 0.0191, "num_tokens": 57416221.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 101.140625, "completions/mean_terminated_length": 101.140625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.007310386841303, "grad_norm": 1.4777119159698486, "kl": 1.36181640625, "learning_rate": 2.312262229872815e-06, "loss": 0.0064, "num_tokens": 57429198.0, "reward": 1.9296875, "reward_std": 0.19887377321720123, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 114.890625, "completions/mean_terminated_length": 114.890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.008528784648187, "grad_norm": 1.4331997632980347, "kl": 2.7509765625, "learning_rate": 2.3068202636043335e-06, "loss": 0.1594, "num_tokens": 57443727.0, "reward": 1.9140625, "reward_std": 0.2049122154712677, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 108.4375, "completions/mean_terminated_length": 108.4375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.0097471824550714, "grad_norm": 1.217549443244934, "kl": 1.23095703125, "learning_rate": 2.3013838737223327e-06, "loss": 0.0355, "num_tokens": 57457403.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 108.421875, "completions/mean_terminated_length": 108.421875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.0109655802619555, "grad_norm": 0.6646640300750732, "kl": 1.34130859375, "learning_rate": 2.295953064167371e-06, "loss": 0.0276, "num_tokens": 57470982.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.0121839780688395, "grad_norm": 0.44594159722328186, "kl": 0.69140625, "learning_rate": 2.2905278388759633e-06, "loss": -0.0023, "num_tokens": 57483902.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 112.625, "completions/mean_terminated_length": 112.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.013402375875724, "grad_norm": 0.9443323612213135, "kl": 1.77197265625, "learning_rate": 2.2851082017805704e-06, "loss": 0.0769, "num_tokens": 57498566.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 106.765625, "completions/mean_terminated_length": 106.765625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.014620773682608, "grad_norm": 0.6314388513565063, "kl": 0.88232421875, "learning_rate": 2.2796941568096076e-06, "loss": -0.0186, "num_tokens": 57512535.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 114.140625, "completions/mean_terminated_length": 114.140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.015839171489492, "grad_norm": 3.048353672027588, "kl": 2.57568359375, "learning_rate": 2.274285707887437e-06, "loss": 0.1468, "num_tokens": 57527072.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 107.671875, "completions/mean_terminated_length": 107.671875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 4.017057569296376, "grad_norm": 0.5624529719352722, "kl": 0.66259765625, "learning_rate": 2.2688828589343594e-06, "loss": 0.0282, "num_tokens": 57540699.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 105.71429443359375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.018275967103259, "grad_norm": 1.8371045589447021, "kl": 4.91748046875, "learning_rate": 2.2634856138666183e-06, "loss": 0.3084, "num_tokens": 57554975.0, "reward": 1.91796875, "reward_std": 0.17854437232017517, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 88.84375, "completions/mean_terminated_length": 88.84375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.019494364910143, "grad_norm": 1.9143024682998657, "kl": 5.111328125, "learning_rate": 2.2580939765963974e-06, "loss": 0.3317, "num_tokens": 57566573.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 108.03125, "completions/mean_terminated_length": 108.03125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 4.020712762717027, "grad_norm": 0.7422929406166077, "kl": 0.9716796875, "learning_rate": 2.252707951031814e-06, "loss": 0.0454, "num_tokens": 57580511.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 94.640625, "completions/mean_terminated_length": 94.640625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.021931160523911, "grad_norm": 0.6676145792007446, "kl": 0.6015625, "learning_rate": 2.247327541076921e-06, "loss": 0.0336, "num_tokens": 57593080.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 109.28125, "completions/mean_terminated_length": 109.28125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.023149558330795, "grad_norm": 1.060720443725586, "kl": 2.3662109375, "learning_rate": 2.2419527506316983e-06, "loss": 0.1296, "num_tokens": 57607234.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 104.859375, "completions/mean_terminated_length": 104.859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.024367956137679, "grad_norm": 1.9314193725585938, "kl": 2.296875, "learning_rate": 2.236583583592049e-06, "loss": 0.141, "num_tokens": 57620873.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 86.015625, "completions/mean_terminated_length": 86.015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.025586353944563, "grad_norm": 0.6668331027030945, "kl": 0.8408203125, "learning_rate": 2.2312200438498043e-06, "loss": 0.0527, "num_tokens": 57632826.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 99.671875, "completions/mean_terminated_length": 99.671875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.026804751751447, "grad_norm": 1.4822824001312256, "kl": 2.02490234375, "learning_rate": 2.225862135292717e-06, "loss": 0.0319, "num_tokens": 57646301.0, "reward": 1.89453125, "reward_std": 0.25677651166915894, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.028023149558331, "grad_norm": 0.5878494381904602, "kl": 0.65380859375, "learning_rate": 2.2205098618044584e-06, "loss": 0.0269, "num_tokens": 57659869.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 108.265625, "completions/mean_terminated_length": 108.265625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.029241547365214, "grad_norm": 0.9585545063018799, "kl": 0.6767578125, "learning_rate": 2.2151632272646094e-06, "loss": 0.0144, "num_tokens": 57674230.0, "reward": 1.99609375, "reward_std": 0.011048543266952038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 97.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.030459945172098, "grad_norm": 1.306715965270996, "kl": 1.8505859375, "learning_rate": 2.2098222355486697e-06, "loss": 0.0523, "num_tokens": 57686766.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 103.546875, "completions/mean_terminated_length": 103.546875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.031678342978982, "grad_norm": 0.22666604816913605, "kl": 0.4443359375, "learning_rate": 2.2044868905280504e-06, "loss": 0.0178, "num_tokens": 57700265.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 112.328125, "completions/mean_terminated_length": 112.328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.0328967407858665, "grad_norm": 3.0318892002105713, "kl": 1.4052734375, "learning_rate": 2.1991571960700587e-06, "loss": 0.0561, "num_tokens": 57714950.0, "reward": 1.9296875, "reward_std": 0.15255236625671387, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 103.828125, "completions/mean_terminated_length": 103.828125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.0341151385927505, "grad_norm": 0.19526298344135284, "kl": 0.4287109375, "learning_rate": 2.193833156037918e-06, "loss": 0.0172, "num_tokens": 57728595.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 104.53125, "completions/mean_terminated_length": 104.53125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.0353335363996345, "grad_norm": 1.191150188446045, "kl": 1.89990234375, "learning_rate": 2.1885147742907465e-06, "loss": 0.0482, "num_tokens": 57742253.0, "reward": 1.87890625, "reward_std": 0.3425048589706421, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.23034733533859253, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 108.078125, "completions/mean_terminated_length": 108.078125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.036551934206519, "grad_norm": 1.0953506231307983, "kl": 0.98388671875, "learning_rate": 2.183202054683565e-06, "loss": 0.0394, "num_tokens": 57756690.0, "reward": 1.95703125, "reward_std": 0.07999982684850693, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 100.828125, "completions/mean_terminated_length": 100.828125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.037770332013403, "grad_norm": 0.5106396079063416, "kl": 0.59814453125, "learning_rate": 2.1778950010672895e-06, "loss": 0.0134, "num_tokens": 57770175.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 113.953125, "completions/mean_terminated_length": 113.953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.038988729820287, "grad_norm": 0.2051907777786255, "kl": 0.41796875, "learning_rate": 2.172593617288723e-06, "loss": 0.0167, "num_tokens": 57785124.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 101.796875, "completions/mean_terminated_length": 101.796875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.04020712762717, "grad_norm": 0.7630807757377625, "kl": 1.75146484375, "learning_rate": 2.1672979071905677e-06, "loss": 0.0712, "num_tokens": 57798519.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 101.703125, "completions/mean_terminated_length": 101.703125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.041425525434054, "grad_norm": 1.1674834489822388, "kl": 2.1298828125, "learning_rate": 2.1620078746114047e-06, "loss": 0.0653, "num_tokens": 57811948.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 103.8125, "completions/mean_terminated_length": 103.8125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.042643923240938, "grad_norm": 1.4392918348312378, "kl": 1.7578125, "learning_rate": 2.1567235233857053e-06, "loss": 0.1054, "num_tokens": 57825216.0, "reward": 1.8984375, "reward_std": 0.24572798609733582, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 98.421875, "completions/mean_terminated_length": 98.421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.043862321047822, "grad_norm": 1.53208589553833, "kl": 2.53662109375, "learning_rate": 2.151444857343822e-06, "loss": 0.1106, "num_tokens": 57838091.0, "reward": 1.859375, "reward_std": 0.30776649713516235, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09449111670255661, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 100.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.045080718854706, "grad_norm": 0.9913836121559143, "kl": 1.02685546875, "learning_rate": 2.146171880311986e-06, "loss": 0.0667, "num_tokens": 57851211.0, "reward": 1.9921875, "reward_std": 0.022097086533904076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 100.9375, "completions/mean_terminated_length": 100.9375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.04629911666159, "grad_norm": 2.523176908493042, "kl": 4.2890625, "learning_rate": 2.1409045961123067e-06, "loss": 0.27, "num_tokens": 57864431.0, "reward": 1.8671875, "reward_std": 0.3756504952907562, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20638974010944366, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 103.59375, "completions/mean_terminated_length": 103.59375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.047517514468474, "grad_norm": 1.212047815322876, "kl": 1.1259765625, "learning_rate": 2.135643008562759e-06, "loss": 0.0296, "num_tokens": 57877853.0, "reward": 1.8984375, "reward_std": 0.24910639226436615, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.0858980342745781, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 90.953125, "completions/mean_terminated_length": 90.953125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.048735912275358, "grad_norm": 1.1058212518692017, "kl": 1.28076171875, "learning_rate": 2.1303871214772e-06, "loss": 0.0617, "num_tokens": 57890122.0, "reward": 1.88671875, "reward_std": 0.1829560250043869, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.13992053270339966, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 89.296875, "completions/mean_terminated_length": 89.296875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.049954310082242, "grad_norm": 2.081711530685425, "kl": 2.94091796875, "learning_rate": 2.1251369386653454e-06, "loss": 0.1275, "num_tokens": 57902621.0, "reward": 1.87109375, "reward_std": 0.27674636244773865, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 113.03125, "completions/mean_terminated_length": 113.03125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.051172707889126, "grad_norm": 0.988120436668396, "kl": 0.92626953125, "learning_rate": 2.119892463932781e-06, "loss": 0.0566, "num_tokens": 57917623.0, "reward": 1.92578125, "reward_std": 0.17176657915115356, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 77.09375, "completions/mean_terminated_length": 77.09375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.052391105696009, "grad_norm": 1.0053017139434814, "kl": 0.67236328125, "learning_rate": 2.1146537010809555e-06, "loss": -0.0084, "num_tokens": 57928509.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 89.0625, "completions/mean_terminated_length": 89.0625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.053609503502893, "grad_norm": 1.1469519138336182, "kl": 0.98388671875, "learning_rate": 2.109420653907176e-06, "loss": 0.0763, "num_tokens": 57940441.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 113.546875, "completions/mean_terminated_length": 113.546875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.054827901309777, "grad_norm": 0.8087908625602722, "kl": 1.63134765625, "learning_rate": 2.104193326204608e-06, "loss": 0.0596, "num_tokens": 57955620.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 86.09375, "completions/mean_terminated_length": 86.09375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.0560462991166615, "grad_norm": 1.6720294952392578, "kl": 2.2958984375, "learning_rate": 2.0989717217622652e-06, "loss": 0.1441, "num_tokens": 57967882.0, "reward": 1.9453125, "reward_std": 0.1165238618850708, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 112.796875, "completions/mean_terminated_length": 112.796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.0572646969235455, "grad_norm": 0.3751678764820099, "kl": 0.5634765625, "learning_rate": 2.093755844365021e-06, "loss": -0.0025, "num_tokens": 57982997.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 107.4375, "completions/mean_terminated_length": 107.4375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.05848309473043, "grad_norm": 0.6022796034812927, "kl": 0.716796875, "learning_rate": 2.0885456977935924e-06, "loss": 0.0086, "num_tokens": 57997361.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 87.8125, "completions/mean_terminated_length": 87.8125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.059701492537314, "grad_norm": 0.7494435906410217, "kl": 0.8798828125, "learning_rate": 2.0833412858245482e-06, "loss": 0.0221, "num_tokens": 58009701.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 81.5, "completions/mean_terminated_length": 81.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.060919890344198, "grad_norm": 0.6797063946723938, "kl": 0.66845703125, "learning_rate": 2.078142612230292e-06, "loss": 0.0167, "num_tokens": 58021493.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 82.15625, "completions/mean_terminated_length": 82.15625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.062138288151082, "grad_norm": 1.0559314489364624, "kl": 0.591796875, "learning_rate": 2.0729496807790737e-06, "loss": 0.0387, "num_tokens": 58033231.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 111.421875, "completions/mean_terminated_length": 111.421875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.063356685957965, "grad_norm": 0.7564180493354797, "kl": 1.5126953125, "learning_rate": 2.0677624952349783e-06, "loss": 0.0913, "num_tokens": 58047986.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 105.671875, "completions/mean_terminated_length": 105.671875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.064575083764849, "grad_norm": 0.298406720161438, "kl": 0.4296875, "learning_rate": 2.0625810593579286e-06, "loss": 0.0172, "num_tokens": 58062517.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 83.125, "completions/mean_terminated_length": 83.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.065793481571733, "grad_norm": 1.3388761281967163, "kl": 1.30224609375, "learning_rate": 2.0574053769036774e-06, "loss": 0.0787, "num_tokens": 58074245.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 106.65625, "completions/mean_terminated_length": 106.65625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.067011879378617, "grad_norm": 1.0530160665512085, "kl": 1.630859375, "learning_rate": 2.0522354516238097e-06, "loss": 0.0602, "num_tokens": 58088439.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 92.15625, "completions/mean_terminated_length": 92.15625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.068230277185501, "grad_norm": 0.7925376296043396, "kl": 1.10205078125, "learning_rate": 2.047071287265735e-06, "loss": 0.0181, "num_tokens": 58101265.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 95.640625, "completions/mean_terminated_length": 95.640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.069448674992385, "grad_norm": 1.4860281944274902, "kl": 1.22314453125, "learning_rate": 2.04191288757269e-06, "loss": 0.0298, "num_tokens": 58114274.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 86.765625, "completions/mean_terminated_length": 86.765625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.070667072799269, "grad_norm": 1.0554871559143066, "kl": 1.19677734375, "learning_rate": 2.0367602562837297e-06, "loss": 0.064, "num_tokens": 58126435.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 102.65625, "completions/mean_terminated_length": 102.65625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.071885470606153, "grad_norm": 0.7658416032791138, "kl": 0.60595703125, "learning_rate": 2.0316133971337236e-06, "loss": 0.0345, "num_tokens": 58140277.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 94.96875, "completions/mean_terminated_length": 94.96875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.073103868413037, "grad_norm": 1.2785346508026123, "kl": 1.93310546875, "learning_rate": 2.0264723138533694e-06, "loss": 0.0662, "num_tokens": 58153283.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 101.1875, "completions/mean_terminated_length": 101.1875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.074322266219921, "grad_norm": 0.2909855544567108, "kl": 0.4287109375, "learning_rate": 2.0213370101691675e-06, "loss": 0.0171, "num_tokens": 58167623.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 84.09375, "completions/mean_terminated_length": 84.09375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.075540664026804, "grad_norm": 0.7977263927459717, "kl": 1.04443359375, "learning_rate": 2.0162074898034347e-06, "loss": -0.0096, "num_tokens": 58179517.0, "reward": 1.91796875, "reward_std": 0.185698002576828, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 92.0625, "completions/mean_terminated_length": 92.0625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.076759061833688, "grad_norm": 0.5706539750099182, "kl": 0.4541015625, "learning_rate": 2.0110837564742936e-06, "loss": 0.0355, "num_tokens": 58192497.0, "reward": 1.98828125, "reward_std": 0.03314562886953354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 96.5625, "completions/mean_terminated_length": 96.5625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.0779774596405725, "grad_norm": 1.1125138998031616, "kl": 2.90771484375, "learning_rate": 2.005965813895675e-06, "loss": 0.1345, "num_tokens": 58205749.0, "reward": 1.90234375, "reward_std": 0.24568898975849152, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 75.078125, "completions/mean_terminated_length": 75.078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.0791958574474565, "grad_norm": 0.865010142326355, "kl": 0.658203125, "learning_rate": 2.000853665777305e-06, "loss": 0.0033, "num_tokens": 58216898.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 100.296875, "completions/mean_terminated_length": 100.296875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.0804142552543405, "grad_norm": 1.0796235799789429, "kl": 1.12548828125, "learning_rate": 1.9957473158247153e-06, "loss": 0.0578, "num_tokens": 58230477.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 105.53125, "completions/mean_terminated_length": 105.53125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.081632653061225, "grad_norm": 0.7885448932647705, "kl": 1.10595703125, "learning_rate": 1.9906467677392382e-06, "loss": 0.0313, "num_tokens": 58244815.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.082851050868109, "grad_norm": 0.33670249581336975, "kl": 0.625, "learning_rate": 1.9855520252179903e-06, "loss": 0.0034, "num_tokens": 58259191.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 91.21875, "completions/mean_terminated_length": 91.21875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.084069448674993, "grad_norm": 0.5906117558479309, "kl": 0.7666015625, "learning_rate": 1.9804630919538882e-06, "loss": 0.0343, "num_tokens": 58271677.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 98.890625, "completions/mean_terminated_length": 98.890625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.085287846481877, "grad_norm": 0.8831138014793396, "kl": 0.80224609375, "learning_rate": 1.9753799716356394e-06, "loss": 0.0648, "num_tokens": 58285246.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 98.90625, "completions/mean_terminated_length": 98.90625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.08650624428876, "grad_norm": 0.6494261622428894, "kl": 1.2783203125, "learning_rate": 1.9703026679477253e-06, "loss": 0.0546, "num_tokens": 58298648.0, "reward": 1.95703125, "reward_std": 0.07999982684850693, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 86.375, "completions/mean_terminated_length": 86.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.087724642095644, "grad_norm": 0.17749913036823273, "kl": 0.46728515625, "learning_rate": 1.9652311845704266e-06, "loss": 0.0187, "num_tokens": 58310304.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 95.015625, "completions/mean_terminated_length": 95.015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.088943039902528, "grad_norm": 0.5464401841163635, "kl": 1.0703125, "learning_rate": 1.9601655251797947e-06, "loss": 0.0632, "num_tokens": 58323409.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 88.03125, "completions/mean_terminated_length": 88.03125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.090161437709412, "grad_norm": 1.496424674987793, "kl": 1.17626953125, "learning_rate": 1.9551056934476653e-06, "loss": 0.0771, "num_tokens": 58335515.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 89.296875, "completions/mean_terminated_length": 89.296875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.091379835516296, "grad_norm": 0.742116391658783, "kl": 1.59765625, "learning_rate": 1.950051693041646e-06, "loss": 0.0282, "num_tokens": 58347966.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 87.578125, "completions/mean_terminated_length": 87.578125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.09259823332318, "grad_norm": 0.7524446845054626, "kl": 0.984375, "learning_rate": 1.9450035276251224e-06, "loss": 0.01, "num_tokens": 58360083.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 96.0625, "completions/mean_terminated_length": 96.0625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.093816631130064, "grad_norm": 0.7304641008377075, "kl": 0.3681640625, "learning_rate": 1.9399612008572455e-06, "loss": 0.0235, "num_tokens": 58373143.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 84.703125, "completions/mean_terminated_length": 84.703125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.095035028936948, "grad_norm": 1.0818705558776855, "kl": 0.7939453125, "learning_rate": 1.934924716392933e-06, "loss": -0.0009, "num_tokens": 58385388.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 84.34375, "completions/mean_terminated_length": 84.34375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.096253426743832, "grad_norm": 1.7337214946746826, "kl": 2.05126953125, "learning_rate": 1.9298940778828747e-06, "loss": 0.1431, "num_tokens": 58397314.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 81.09375, "completions/mean_terminated_length": 81.09375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.097471824550716, "grad_norm": 0.8762800097465515, "kl": 0.548828125, "learning_rate": 1.9248692889735165e-06, "loss": 0.022, "num_tokens": 58408808.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 90.9375, "completions/mean_terminated_length": 90.9375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.098690222357599, "grad_norm": 0.5516132712364197, "kl": 0.7470703125, "learning_rate": 1.9198503533070688e-06, "loss": 0.0288, "num_tokens": 58421324.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 98.984375, "completions/mean_terminated_length": 98.984375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.099908620164483, "grad_norm": 1.1109602451324463, "kl": 0.74755859375, "learning_rate": 1.9148372745215006e-06, "loss": 0.0322, "num_tokens": 58434851.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 106.921875, "completions/mean_terminated_length": 106.921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.1011270179713675, "grad_norm": 2.3592162132263184, "kl": 4.32275390625, "learning_rate": 1.9098300562505266e-06, "loss": 0.3963, "num_tokens": 58448502.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 95.390625, "completions/mean_terminated_length": 95.390625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.1023454157782515, "grad_norm": 0.6460157632827759, "kl": 0.64013671875, "learning_rate": 1.9048287021236233e-06, "loss": 0.0311, "num_tokens": 58461519.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 104.1875, "completions/mean_terminated_length": 104.1875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.1035638135851356, "grad_norm": 0.8869296908378601, "kl": 1.54443359375, "learning_rate": 1.8998332157660093e-06, "loss": 0.108, "num_tokens": 58475003.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 108.234375, "completions/mean_terminated_length": 108.234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.10478221139202, "grad_norm": 0.6428377032279968, "kl": 0.66748046875, "learning_rate": 1.894843600798655e-06, "loss": 0.0307, "num_tokens": 58489650.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 98.109375, "completions/mean_terminated_length": 98.109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.106000609198904, "grad_norm": 1.2004053592681885, "kl": 0.7255859375, "learning_rate": 1.8898598608382723e-06, "loss": -0.003, "num_tokens": 58503345.0, "reward": 1.921875, "reward_std": 0.16405825316905975, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 121.546875, "completions/mean_terminated_length": 121.546875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.107219007005788, "grad_norm": 1.5246907472610474, "kl": 2.2763671875, "learning_rate": 1.8848819994973167e-06, "loss": 0.1085, "num_tokens": 58519372.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 94.46875, "completions/mean_terminated_length": 94.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.108437404812672, "grad_norm": 0.9973763823509216, "kl": 1.37646484375, "learning_rate": 1.8799100203839837e-06, "loss": 0.0461, "num_tokens": 58532082.0, "reward": 1.96484375, "reward_std": 0.06563031673431396, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 104.546875, "completions/mean_terminated_length": 104.546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.109655802619555, "grad_norm": 1.0305113792419434, "kl": 2.15234375, "learning_rate": 1.8749439271021973e-06, "loss": 0.0697, "num_tokens": 58546005.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 94.125, "completions/mean_terminated_length": 94.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.110874200426439, "grad_norm": 0.8294042348861694, "kl": 0.83544921875, "learning_rate": 1.8699837232516226e-06, "loss": 0.0176, "num_tokens": 58559093.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 98.296875, "completions/mean_terminated_length": 98.296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.112092598233323, "grad_norm": 0.6908096671104431, "kl": 0.91162109375, "learning_rate": 1.8650294124276558e-06, "loss": 0.0452, "num_tokens": 58572792.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 92.125, "completions/mean_terminated_length": 92.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.113310996040207, "grad_norm": 2.272204637527466, "kl": 1.8408203125, "learning_rate": 1.8600809982214206e-06, "loss": 0.038, "num_tokens": 58586232.0, "reward": 1.87109375, "reward_std": 0.2719590961933136, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.20009763538837433, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 101.1875, "completions/mean_terminated_length": 101.1875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.114529393847091, "grad_norm": 1.1488088369369507, "kl": 1.08984375, "learning_rate": 1.8551384842197607e-06, "loss": 0.0285, "num_tokens": 58599788.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 102.0625, "completions/mean_terminated_length": 102.0625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.115747791653975, "grad_norm": 1.5157639980316162, "kl": 2.75439453125, "learning_rate": 1.8502018740052496e-06, "loss": 0.0951, "num_tokens": 58613856.0, "reward": 1.8671875, "reward_std": 0.2604832649230957, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 89.0625, "completions/mean_terminated_length": 89.0625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.116966189460859, "grad_norm": 1.5881952047348022, "kl": 1.7294921875, "learning_rate": 1.8452711711561845e-06, "loss": 0.1061, "num_tokens": 58626836.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 97.515625, "completions/mean_terminated_length": 97.515625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.118184587267743, "grad_norm": 1.167265772819519, "kl": 2.29736328125, "learning_rate": 1.840346379246567e-06, "loss": 0.1372, "num_tokens": 58639917.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 100.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 4.119402985074627, "grad_norm": 1.1488467454910278, "kl": 1.3486328125, "learning_rate": 1.8354275018461298e-06, "loss": -0.0048, "num_tokens": 58653461.0, "reward": 1.88671875, "reward_std": 0.25935858488082886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 104.328125, "completions/mean_terminated_length": 104.328125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.120621382881511, "grad_norm": 0.7301614880561829, "kl": 0.84912109375, "learning_rate": 1.8305145425203085e-06, "loss": 0.0173, "num_tokens": 58667578.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 95.65625, "completions/mean_terminated_length": 95.65625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.121839780688394, "grad_norm": 1.2169111967086792, "kl": 1.59033203125, "learning_rate": 1.8256075048302535e-06, "loss": 0.0336, "num_tokens": 58680708.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 96.78125, "completions/mean_terminated_length": 96.78125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.1230581784952784, "grad_norm": 3.067965507507324, "kl": 2.79833984375, "learning_rate": 1.820706392332824e-06, "loss": 0.1099, "num_tokens": 58693726.0, "reward": 1.890625, "reward_std": 0.30935922265052795, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 108.609375, "completions/mean_terminated_length": 108.609375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.1242765763021625, "grad_norm": 0.5863792300224304, "kl": 0.8525390625, "learning_rate": 1.8158112085805801e-06, "loss": 0.0031, "num_tokens": 58707277.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 103.890625, "completions/mean_terminated_length": 103.890625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.1254949741090465, "grad_norm": 0.929050087928772, "kl": 1.8798828125, "learning_rate": 1.8109219571217818e-06, "loss": 0.0479, "num_tokens": 58721182.0, "reward": 1.8828125, "reward_std": 0.29187676310539246, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 103.234375, "completions/mean_terminated_length": 103.234375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.126713371915931, "grad_norm": 0.39891448616981506, "kl": 0.572265625, "learning_rate": 1.8060386415003983e-06, "loss": 0.004, "num_tokens": 58735381.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 98.765625, "completions/mean_terminated_length": 98.765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 4.127931769722815, "grad_norm": 1.0708922147750854, "kl": 1.37744140625, "learning_rate": 1.8011612652560907e-06, "loss": 0.0285, "num_tokens": 58749150.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.129150167529699, "grad_norm": 0.8452755808830261, "kl": 1.02685546875, "learning_rate": 1.796289831924215e-06, "loss": 0.0411, "num_tokens": 58760854.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 102.203125, "completions/mean_terminated_length": 102.203125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.130368565336583, "grad_norm": 0.6467204689979553, "kl": 1.06396484375, "learning_rate": 1.7914243450358215e-06, "loss": 0.033, "num_tokens": 58774755.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 83.15625, "completions/mean_terminated_length": 83.15625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.131586963143467, "grad_norm": 0.5106728076934814, "kl": 0.82373046875, "learning_rate": 1.7865648081176501e-06, "loss": 0.0306, "num_tokens": 58786741.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 100.4375, "completions/mean_terminated_length": 100.4375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.13280536095035, "grad_norm": 0.6431139707565308, "kl": 1.279296875, "learning_rate": 1.7817112246921232e-06, "loss": 0.0057, "num_tokens": 58801057.0, "reward": 1.91796875, "reward_std": 0.185698002576828, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 114.109375, "completions/mean_terminated_length": 114.109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.134023758757234, "grad_norm": 0.8167979121208191, "kl": 1.5078125, "learning_rate": 1.776863598277353e-06, "loss": 0.0007, "num_tokens": 58816240.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 111.328125, "completions/mean_terminated_length": 111.328125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.135242156564118, "grad_norm": 1.0756851434707642, "kl": 0.5498046875, "learning_rate": 1.7720219323871346e-06, "loss": 0.0305, "num_tokens": 58830797.0, "reward": 1.99609375, "reward_std": 0.011048543266952038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 93.0625, "completions/mean_terminated_length": 93.0625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.136460554371002, "grad_norm": 0.980718731880188, "kl": 3.12939453125, "learning_rate": 1.767186230530935e-06, "loss": 0.1894, "num_tokens": 58843577.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 104.65625, "completions/mean_terminated_length": 104.65625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.137678952177886, "grad_norm": 1.414219617843628, "kl": 2.693359375, "learning_rate": 1.7623564962139061e-06, "loss": 0.1612, "num_tokens": 58857339.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 102.953125, "completions/mean_terminated_length": 102.953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.13889734998477, "grad_norm": 0.5907962918281555, "kl": 0.96630859375, "learning_rate": 1.757532732936873e-06, "loss": -0.0197, "num_tokens": 58870688.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.140115747791654, "grad_norm": 1.4976065158843994, "kl": 3.501953125, "learning_rate": 1.752714944196332e-06, "loss": 0.2308, "num_tokens": 58881968.0, "reward": 1.89453125, "reward_std": 0.29831066727638245, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.141334145598538, "grad_norm": 5.780910968780518, "kl": 1.3466796875, "learning_rate": 1.7479031334844421e-06, "loss": 0.0244, "num_tokens": 58896372.0, "reward": 1.94140625, "reward_std": 0.10881553590297699, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 97.828125, "completions/mean_terminated_length": 97.828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.142552543405422, "grad_norm": 0.6031279563903809, "kl": 0.78271484375, "learning_rate": 1.7430973042890399e-06, "loss": 0.0276, "num_tokens": 58909657.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 107.984375, "completions/mean_terminated_length": 107.984375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.143770941212306, "grad_norm": 1.3788105249404907, "kl": 2.68115234375, "learning_rate": 1.7382974600936198e-06, "loss": 0.0772, "num_tokens": 58923704.0, "reward": 1.875, "reward_std": 0.268692284822464, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18298126757144928, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 99.21875, "completions/mean_terminated_length": 99.21875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.144989339019189, "grad_norm": 1.6494852304458618, "kl": 1.640625, "learning_rate": 1.7335036043773402e-06, "loss": 0.0741, "num_tokens": 58937358.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 123.59375, "completions/mean_terminated_length": 123.59375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.1462077368260735, "grad_norm": 2.2478654384613037, "kl": 4.4189453125, "learning_rate": 1.7287157406150213e-06, "loss": 0.2664, "num_tokens": 58952884.0, "reward": 1.91015625, "reward_std": 0.20832960307598114, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.10076284408569336, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 98.15625, "completions/mean_terminated_length": 98.15625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.1474261346329575, "grad_norm": 1.3515349626541138, "kl": 2.78955078125, "learning_rate": 1.7239338722771326e-06, "loss": 0.1422, "num_tokens": 58965934.0, "reward": 1.9140625, "reward_std": 0.19728106260299683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 106.546875, "completions/mean_terminated_length": 106.546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.1486445324398415, "grad_norm": 1.3669525384902954, "kl": 2.6904296875, "learning_rate": 1.7191580028298006e-06, "loss": 0.135, "num_tokens": 58980097.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 101.3125, "completions/mean_terminated_length": 101.3125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.149862930246726, "grad_norm": 1.4851890802383423, "kl": 2.10986328125, "learning_rate": 1.714388135734808e-06, "loss": 0.1358, "num_tokens": 58993317.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 107.859375, "completions/mean_terminated_length": 107.859375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.15108132805361, "grad_norm": 0.4749124050140381, "kl": 0.94091796875, "learning_rate": 1.709624274449584e-06, "loss": 0.0451, "num_tokens": 59007812.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 93.78125, "completions/mean_terminated_length": 93.78125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.152299725860494, "grad_norm": 1.1088509559631348, "kl": 1.09228515625, "learning_rate": 1.7048664224272027e-06, "loss": 0.0464, "num_tokens": 59020582.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 115.25, "completions/mean_terminated_length": 115.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.153518123667378, "grad_norm": 0.8351585865020752, "kl": 0.81103515625, "learning_rate": 1.7001145831163845e-06, "loss": 0.0389, "num_tokens": 59035318.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 121.9375, "completions/mean_terminated_length": 121.9375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.154736521474262, "grad_norm": 2.831439971923828, "kl": 3.00439453125, "learning_rate": 1.6953687599614954e-06, "loss": 0.0772, "num_tokens": 59050842.0, "reward": 1.8515625, "reward_std": 0.3629320561885834, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.24587368965148926, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 115.984375, "completions/mean_terminated_length": 115.984375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.155954919281145, "grad_norm": 1.072355031967163, "kl": 1.623046875, "learning_rate": 1.690628956402528e-06, "loss": 0.0945, "num_tokens": 59065913.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 4.157173317088029, "grad_norm": 0.7777761220932007, "kl": 1.24609375, "learning_rate": 1.6858951758751273e-06, "loss": 0.0381, "num_tokens": 59079657.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 115.203125, "completions/mean_terminated_length": 115.203125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.158391714894913, "grad_norm": 0.9294971823692322, "kl": 0.90234375, "learning_rate": 1.6811674218105588e-06, "loss": 0.0319, "num_tokens": 59094126.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 116.171875, "completions/mean_terminated_length": 116.171875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.159610112701797, "grad_norm": 1.0481559038162231, "kl": 4.1455078125, "learning_rate": 1.6764456976357279e-06, "loss": 0.244, "num_tokens": 59109193.0, "reward": 1.90234375, "reward_std": 0.2151644080877304, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 111.765625, "completions/mean_terminated_length": 111.765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.160828510508681, "grad_norm": 1.298581838607788, "kl": 1.39208984375, "learning_rate": 1.671730006773169e-06, "loss": 0.0786, "num_tokens": 59123706.0, "reward": 1.96484375, "reward_std": 0.09943688660860062, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 118.140625, "completions/mean_terminated_length": 118.140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.162046908315565, "grad_norm": 1.6237605810165405, "kl": 4.23291015625, "learning_rate": 1.66702035264104e-06, "loss": 0.2449, "num_tokens": 59138667.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 93.3125, "completions/mean_terminated_length": 93.3125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.163265306122449, "grad_norm": 0.8977159261703491, "kl": 0.9482421875, "learning_rate": 1.662316738653128e-06, "loss": 0.0192, "num_tokens": 59151575.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 91.46875, "completions/mean_terminated_length": 91.46875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.164483703929333, "grad_norm": 0.1392914205789566, "kl": 0.43701171875, "learning_rate": 1.6576191682188336e-06, "loss": 0.0175, "num_tokens": 59163725.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 106.625, "completions/mean_terminated_length": 106.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.165702101736217, "grad_norm": 0.6323865652084351, "kl": 1.19287109375, "learning_rate": 1.652927644743183e-06, "loss": -0.0225, "num_tokens": 59177613.0, "reward": 1.90625, "reward_std": 0.2041158676147461, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 96.09375, "completions/mean_terminated_length": 96.09375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.1669204995431, "grad_norm": 0.8955700993537903, "kl": 1.73486328125, "learning_rate": 1.6482421716268215e-06, "loss": 0.0698, "num_tokens": 59190539.0, "reward": 1.9453125, "reward_std": 0.10263003408908844, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 114.34375, "completions/mean_terminated_length": 114.34375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.168138897349984, "grad_norm": 1.2691971063613892, "kl": 5.423828125, "learning_rate": 1.643562752265999e-06, "loss": 0.3285, "num_tokens": 59204993.0, "reward": 1.85546875, "reward_std": 0.3104251027107239, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.19507674872875214, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 104.09375, "completions/mean_terminated_length": 104.09375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.1693572951568685, "grad_norm": 0.8183091878890991, "kl": 1.232421875, "learning_rate": 1.6388893900525882e-06, "loss": 0.001, "num_tokens": 59218879.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.1705756929637525, "grad_norm": 2.370593547821045, "kl": 6.162109375, "learning_rate": 1.6342220883740667e-06, "loss": 0.3843, "num_tokens": 59233391.0, "reward": 1.8984375, "reward_std": 0.24094071984291077, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 93.234375, "completions/mean_terminated_length": 93.234375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.171794090770637, "grad_norm": 1.0057780742645264, "kl": 1.48095703125, "learning_rate": 1.6295608506135162e-06, "loss": -0.0159, "num_tokens": 59246230.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 107.78125, "completions/mean_terminated_length": 107.78125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.173012488577521, "grad_norm": 1.3434088230133057, "kl": 3.3525390625, "learning_rate": 1.6249056801496277e-06, "loss": 0.2844, "num_tokens": 59259480.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 101.65625, "completions/mean_terminated_length": 101.65625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.174230886384405, "grad_norm": 0.8044195175170898, "kl": 0.6279296875, "learning_rate": 1.6202565803566917e-06, "loss": 0.0465, "num_tokens": 59272970.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 93.234375, "completions/mean_terminated_length": 93.234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.175449284191289, "grad_norm": 2.9116005897521973, "kl": 0.85888671875, "learning_rate": 1.6156135546046003e-06, "loss": 0.0134, "num_tokens": 59286001.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 96.859375, "completions/mean_terminated_length": 96.859375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.176667681998173, "grad_norm": 3.8349790573120117, "kl": 1.02099609375, "learning_rate": 1.6109766062588416e-06, "loss": 0.0669, "num_tokens": 59298904.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 92.328125, "completions/mean_terminated_length": 92.328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 4.177886079805057, "grad_norm": 0.9615126848220825, "kl": 1.44775390625, "learning_rate": 1.6063457386805004e-06, "loss": -0.0386, "num_tokens": 59311261.0, "reward": 1.87890625, "reward_std": 0.2855922281742096, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.23034733533859253, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 98.296875, "completions/mean_terminated_length": 98.296875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.17910447761194, "grad_norm": 1.0879719257354736, "kl": 1.9638671875, "learning_rate": 1.6017209552262513e-06, "loss": 0.0484, "num_tokens": 59324344.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 105.0625, "completions/mean_terminated_length": 105.0625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.180322875418824, "grad_norm": 1.9450162649154663, "kl": 2.82421875, "learning_rate": 1.5971022592483543e-06, "loss": 0.1941, "num_tokens": 59337244.0, "reward": 1.8828125, "reward_std": 0.27040714025497437, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.21578919887542725, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 91.953125, "completions/mean_terminated_length": 91.953125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.181541273225708, "grad_norm": 1.5815356969833374, "kl": 2.19970703125, "learning_rate": 1.5924896540946677e-06, "loss": 0.0923, "num_tokens": 59349609.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 102.296875, "completions/mean_terminated_length": 102.296875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.182759671032592, "grad_norm": 0.9511874914169312, "kl": 1.06787109375, "learning_rate": 1.5878831431086284e-06, "loss": 0.0118, "num_tokens": 59363340.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 99.546875, "completions/mean_terminated_length": 99.546875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.183978068839476, "grad_norm": 0.510164201259613, "kl": 0.447265625, "learning_rate": 1.5832827296292564e-06, "loss": 0.0098, "num_tokens": 59376791.0, "reward": 1.99609375, "reward_std": 0.011048543266952038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 118.71875, "completions/mean_terminated_length": 118.71875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.18519646664636, "grad_norm": 0.8949517607688904, "kl": 1.2607421875, "learning_rate": 1.5786884169911543e-06, "loss": 0.0146, "num_tokens": 59392109.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 121.90625, "completions/mean_terminated_length": 121.90625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.186414864453244, "grad_norm": 1.2081894874572754, "kl": 3.26806640625, "learning_rate": 1.5741002085244983e-06, "loss": 0.2221, "num_tokens": 59407431.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 99.5, "completions/mean_terminated_length": 99.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.187633262260128, "grad_norm": 0.3319830596446991, "kl": 0.37158203125, "learning_rate": 1.5695181075550436e-06, "loss": 0.0149, "num_tokens": 59420847.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 89.796875, "completions/mean_terminated_length": 89.796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.188851660067012, "grad_norm": 0.17400163412094116, "kl": 0.38623046875, "learning_rate": 1.564942117404119e-06, "loss": 0.0154, "num_tokens": 59433778.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 102.65625, "completions/mean_terminated_length": 102.65625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.190070057873895, "grad_norm": 1.4192432165145874, "kl": 2.466796875, "learning_rate": 1.560372241388618e-06, "loss": 0.1843, "num_tokens": 59447524.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 107.609375, "completions/mean_terminated_length": 107.609375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.1912884556807795, "grad_norm": 1.020759105682373, "kl": 1.08740234375, "learning_rate": 1.5558084828210086e-06, "loss": -0.0135, "num_tokens": 59461955.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 99.34375, "completions/mean_terminated_length": 99.34375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.1925068534876635, "grad_norm": 0.15598556399345398, "kl": 0.3857421875, "learning_rate": 1.5512508450093233e-06, "loss": 0.0154, "num_tokens": 59475137.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 105.1875, "completions/mean_terminated_length": 105.1875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.1937252512945475, "grad_norm": 1.112154245376587, "kl": 2.0849609375, "learning_rate": 1.5466993312571577e-06, "loss": 0.0746, "num_tokens": 59489005.0, "reward": 1.92578125, "reward_std": 0.1578727513551712, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 93.59375, "completions/mean_terminated_length": 93.59375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.194943649101432, "grad_norm": 1.3872567415237427, "kl": 1.66552734375, "learning_rate": 1.5421539448636647e-06, "loss": 0.0181, "num_tokens": 59501611.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 113.328125, "completions/mean_terminated_length": 113.328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.196162046908316, "grad_norm": 1.3381801843643188, "kl": 1.8095703125, "learning_rate": 1.53761468912356e-06, "loss": 0.0828, "num_tokens": 59515864.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 84.71875, "completions/mean_terminated_length": 84.71875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.1973804447152, "grad_norm": 2.04728102684021, "kl": 1.4560546875, "learning_rate": 1.533081567327115e-06, "loss": -0.008, "num_tokens": 59527638.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.19697457551956177, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 98.3125, "completions/mean_terminated_length": 98.3125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.198598842522084, "grad_norm": 0.4081779420375824, "kl": 0.59423828125, "learning_rate": 1.5285545827601545e-06, "loss": -0.0182, "num_tokens": 59541050.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 86.09375, "completions/mean_terminated_length": 86.09375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.199817240328968, "grad_norm": 0.9551244378089905, "kl": 1.09130859375, "learning_rate": 1.524033738704056e-06, "loss": 0.0031, "num_tokens": 59552536.0, "reward": 1.9296875, "reward_std": 0.19887377321720123, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 101.46875, "completions/mean_terminated_length": 101.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.201035638135851, "grad_norm": 0.6725949048995972, "kl": 0.88525390625, "learning_rate": 1.5195190384357405e-06, "loss": 0.0574, "num_tokens": 59565774.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 98.4375, "completions/mean_terminated_length": 98.4375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.202254035942735, "grad_norm": 0.5913909673690796, "kl": 0.6640625, "learning_rate": 1.5150104852276847e-06, "loss": 0.0286, "num_tokens": 59579170.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 98.703125, "completions/mean_terminated_length": 98.703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.203472433749619, "grad_norm": 1.048985242843628, "kl": 1.3427734375, "learning_rate": 1.510508082347899e-06, "loss": 0.0293, "num_tokens": 59592391.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 124.9375, "completions/mean_terminated_length": 124.9375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.204690831556503, "grad_norm": 0.84678715467453, "kl": 0.7587890625, "learning_rate": 1.5060118330599448e-06, "loss": 0.0315, "num_tokens": 59608123.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 101.0625, "completions/mean_terminated_length": 101.0625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.205909229363387, "grad_norm": 0.7183091044425964, "kl": 1.12939453125, "learning_rate": 1.50152174062292e-06, "loss": 0.0448, "num_tokens": 59621607.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 93.96875, "completions/mean_terminated_length": 93.96875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.207127627170271, "grad_norm": 1.0059226751327515, "kl": 1.61572265625, "learning_rate": 1.4970378082914583e-06, "loss": 0.0641, "num_tokens": 59634493.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 92.34375, "completions/mean_terminated_length": 92.34375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.208346024977155, "grad_norm": 0.6883041858673096, "kl": 0.8857421875, "learning_rate": 1.4925600393157325e-06, "loss": 0.0045, "num_tokens": 59646963.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 111.78125, "completions/mean_terminated_length": 111.78125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.209564422784039, "grad_norm": 0.9901679754257202, "kl": 1.234375, "learning_rate": 1.4880884369414394e-06, "loss": 0.0282, "num_tokens": 59661533.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 116.078125, "completions/mean_terminated_length": 116.078125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.210782820590923, "grad_norm": 0.13355965912342072, "kl": 0.3564453125, "learning_rate": 1.4836230044098164e-06, "loss": 0.0143, "num_tokens": 59676626.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 100.015625, "completions/mean_terminated_length": 100.015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.212001218397807, "grad_norm": 1.0190314054489136, "kl": 1.9755859375, "learning_rate": 1.4791637449576202e-06, "loss": 0.0909, "num_tokens": 59690723.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 104.375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.21321961620469, "grad_norm": 1.9265464544296265, "kl": 1.6865234375, "learning_rate": 1.474710661817137e-06, "loss": 0.0547, "num_tokens": 59704627.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11545931547880173, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 100.734375, "completions/mean_terminated_length": 86.0793685913086, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.2144380140115745, "grad_norm": 2.92242431640625, "kl": 6.130859375, "learning_rate": 1.4702637582161761e-06, "loss": 0.3861, "num_tokens": 59717530.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 109.390625, "completions/mean_terminated_length": 109.390625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 4.2156564118184585, "grad_norm": 1.6459993124008179, "kl": 4.45556640625, "learning_rate": 1.4658230373780668e-06, "loss": 0.2466, "num_tokens": 59731651.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 93.9375, "completions/mean_terminated_length": 93.9375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.216874809625343, "grad_norm": 0.7225376963615417, "kl": 1.56982421875, "learning_rate": 1.4613885025216602e-06, "loss": 0.0334, "num_tokens": 59744639.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 100.625, "completions/mean_terminated_length": 100.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.218093207432227, "grad_norm": 0.4800090491771698, "kl": 0.478515625, "learning_rate": 1.4569601568613146e-06, "loss": 0.0191, "num_tokens": 59757927.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 91.8125, "completions/mean_terminated_length": 91.8125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.219311605239111, "grad_norm": 0.19777846336364746, "kl": 0.4326171875, "learning_rate": 1.4525380036069103e-06, "loss": 0.0173, "num_tokens": 59770331.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 107.125, "completions/mean_terminated_length": 107.125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.220530003045995, "grad_norm": 0.13919298350811005, "kl": 0.37353515625, "learning_rate": 1.448122045963839e-06, "loss": 0.0149, "num_tokens": 59784531.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 105.71875, "completions/mean_terminated_length": 105.71875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.221748400852879, "grad_norm": 0.48354649543762207, "kl": 0.392578125, "learning_rate": 1.4437122871329956e-06, "loss": -0.0018, "num_tokens": 59798865.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 87.0625, "completions/mean_terminated_length": 87.0625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.222966798659763, "grad_norm": 0.21535994112491608, "kl": 0.4375, "learning_rate": 1.4393087303107866e-06, "loss": 0.0175, "num_tokens": 59810821.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 115.421875, "completions/mean_terminated_length": 115.421875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.224185196466646, "grad_norm": 0.48070570826530457, "kl": 0.8037109375, "learning_rate": 1.434911378689121e-06, "loss": 0.0023, "num_tokens": 59825896.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 96.125, "completions/mean_terminated_length": 96.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.22540359427353, "grad_norm": 0.6753901839256287, "kl": 0.6552734375, "learning_rate": 1.4305202354554138e-06, "loss": 0.0244, "num_tokens": 59838872.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 106.890625, "completions/mean_terminated_length": 106.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.226621992080414, "grad_norm": 0.772754967212677, "kl": 0.68408203125, "learning_rate": 1.4261353037925717e-06, "loss": 0.0158, "num_tokens": 59852825.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 109.203125, "completions/mean_terminated_length": 109.203125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.227840389887298, "grad_norm": 1.1705390214920044, "kl": 0.99658203125, "learning_rate": 1.4217565868790073e-06, "loss": 0.0106, "num_tokens": 59866822.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 85.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.229058787694182, "grad_norm": 0.683925449848175, "kl": 1.29443359375, "learning_rate": 1.4173840878886235e-06, "loss": 0.0441, "num_tokens": 59878590.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 95.078125, "completions/mean_terminated_length": 95.078125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.230277185501066, "grad_norm": 0.1869799792766571, "kl": 0.42041015625, "learning_rate": 1.4130178099908187e-06, "loss": 0.0168, "num_tokens": 59891475.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 97.625, "completions/mean_terminated_length": 97.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.23149558330795, "grad_norm": 1.275611400604248, "kl": 3.50146484375, "learning_rate": 1.4086577563504832e-06, "loss": 0.0995, "num_tokens": 59904923.0, "reward": 1.859375, "reward_std": 0.28258034586906433, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2314550280570984, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 124.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.232713981114834, "grad_norm": 1.5437270402908325, "kl": 3.75927734375, "learning_rate": 1.4043039301279904e-06, "loss": 0.1939, "num_tokens": 59921267.0, "reward": 1.8984375, "reward_std": 0.2872621417045593, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 97.265625, "completions/mean_terminated_length": 97.265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.233932378921718, "grad_norm": 0.13778668642044067, "kl": 0.36962890625, "learning_rate": 1.3999563344792023e-06, "loss": 0.0148, "num_tokens": 59934236.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 102.515625, "completions/mean_terminated_length": 102.515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.235150776728602, "grad_norm": 0.14032301306724548, "kl": 0.4765625, "learning_rate": 1.3956149725554657e-06, "loss": 0.0191, "num_tokens": 59947565.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 100.828125, "completions/mean_terminated_length": 100.828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.2363691745354854, "grad_norm": 0.5787200331687927, "kl": 1.17724609375, "learning_rate": 1.3912798475036093e-06, "loss": 0.031, "num_tokens": 59961066.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 106.671875, "completions/mean_terminated_length": 106.671875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.2375875723423695, "grad_norm": 0.5441937446594238, "kl": 0.79345703125, "learning_rate": 1.3869509624659406e-06, "loss": 0.0103, "num_tokens": 59975221.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 118.390625, "completions/mean_terminated_length": 118.390625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.2388059701492535, "grad_norm": 0.9358147382736206, "kl": 1.50927734375, "learning_rate": 1.3826283205802427e-06, "loss": 0.0172, "num_tokens": 59990574.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 107.171875, "completions/mean_terminated_length": 107.171875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 4.240024367956138, "grad_norm": 1.1787397861480713, "kl": 2.91162109375, "learning_rate": 1.378311924979776e-06, "loss": 0.1117, "num_tokens": 60004257.0, "reward": 1.90625, "reward_std": 0.2041158676147461, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 99.953125, "completions/mean_terminated_length": 99.953125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.241242765763022, "grad_norm": 1.077561378479004, "kl": 1.10205078125, "learning_rate": 1.374001778793268e-06, "loss": 0.0593, "num_tokens": 60017702.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 103.75, "completions/mean_terminated_length": 103.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.242461163569906, "grad_norm": 3.0469188690185547, "kl": 1.12646484375, "learning_rate": 1.3696978851449238e-06, "loss": 0.055, "num_tokens": 60031534.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 89.953125, "completions/mean_terminated_length": 89.953125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.24367956137679, "grad_norm": 0.8573034405708313, "kl": 0.97802734375, "learning_rate": 1.3654002471544093e-06, "loss": 0.0058, "num_tokens": 60043643.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 93.046875, "completions/mean_terminated_length": 93.046875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.244897959183674, "grad_norm": 2.597384214401245, "kl": 1.5810546875, "learning_rate": 1.361108867936859e-06, "loss": 0.0339, "num_tokens": 60056110.0, "reward": 1.921875, "reward_std": 0.22097085416316986, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1510545015335083, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 97.515625, "completions/mean_terminated_length": 97.515625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.246116356990558, "grad_norm": 0.7922524213790894, "kl": 1.3115234375, "learning_rate": 1.3568237506028736e-06, "loss": 0.056, "num_tokens": 60069175.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 82.96875, "completions/mean_terminated_length": 82.96875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.247334754797441, "grad_norm": 1.3087974786758423, "kl": 2.470703125, "learning_rate": 1.352544898258511e-06, "loss": 0.1274, "num_tokens": 60080237.0, "reward": 1.92578125, "reward_std": 0.16360090672969818, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 132.46875, "completions/mean_terminated_length": 132.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.248553152604325, "grad_norm": 1.1135637760162354, "kl": 2.708984375, "learning_rate": 1.3482723140052878e-06, "loss": 0.2069, "num_tokens": 60097155.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 136.15625, "completions/mean_terminated_length": 136.15625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.249771550411209, "grad_norm": 5.54084587097168, "kl": 1.87939453125, "learning_rate": 1.344006000940179e-06, "loss": 0.043, "num_tokens": 60114381.0, "reward": 1.875, "reward_std": 0.30150383710861206, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 105.125, "completions/mean_terminated_length": 105.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.250989948218093, "grad_norm": 0.42682430148124695, "kl": 0.408203125, "learning_rate": 1.339745962155613e-06, "loss": 0.0163, "num_tokens": 60128261.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 114.328125, "completions/mean_terminated_length": 114.328125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.252208346024977, "grad_norm": 2.5245726108551025, "kl": 2.2353515625, "learning_rate": 1.335492200739472e-06, "loss": 0.0848, "num_tokens": 60143242.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 113.953125, "completions/mean_terminated_length": 113.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.253426743831861, "grad_norm": 1.4576308727264404, "kl": 2.111328125, "learning_rate": 1.331244719775089e-06, "loss": 0.1182, "num_tokens": 60157983.0, "reward": 1.89453125, "reward_std": 0.2519892454147339, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 106.453125, "completions/mean_terminated_length": 106.453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.254645141638745, "grad_norm": 0.8772005438804626, "kl": 0.63037109375, "learning_rate": 1.327003522341237e-06, "loss": 0.0572, "num_tokens": 60171788.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 98.765625, "completions/mean_terminated_length": 98.765625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.255863539445629, "grad_norm": 1.064562201499939, "kl": 0.81298828125, "learning_rate": 1.322768611512144e-06, "loss": 0.0373, "num_tokens": 60185093.0, "reward": 1.9296875, "reward_std": 0.19887377321720123, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10652101784944534, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 93.03125, "completions/mean_terminated_length": 93.03125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.257081937252513, "grad_norm": 0.903628945350647, "kl": 1.02880859375, "learning_rate": 1.3185399903574724e-06, "loss": 0.03, "num_tokens": 60197375.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 118.859375, "completions/mean_terminated_length": 118.859375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.258300335059397, "grad_norm": 1.6201481819152832, "kl": 1.4853515625, "learning_rate": 1.3143176619423348e-06, "loss": 0.0544, "num_tokens": 60213702.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 99.0, "completions/mean_terminated_length": 99.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.2595187328662805, "grad_norm": 0.8657013177871704, "kl": 1.17431640625, "learning_rate": 1.3101016293272739e-06, "loss": 0.0459, "num_tokens": 60226406.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 104.515625, "completions/mean_terminated_length": 104.515625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.2607371306731645, "grad_norm": 0.9379866719245911, "kl": 0.83349609375, "learning_rate": 1.3058918955682765e-06, "loss": 0.0117, "num_tokens": 60240471.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 106.734375, "completions/mean_terminated_length": 106.734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.2619555284800485, "grad_norm": 0.35345733165740967, "kl": 0.625, "learning_rate": 1.301688463716757e-06, "loss": -0.0005, "num_tokens": 60254774.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 95.671875, "completions/mean_terminated_length": 95.671875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.263173926286933, "grad_norm": 0.7569493651390076, "kl": 1.2548828125, "learning_rate": 1.2974913368195696e-06, "loss": 0.0304, "num_tokens": 60267409.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 101.265625, "completions/mean_terminated_length": 101.265625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.264392324093817, "grad_norm": 0.8996740579605103, "kl": 0.85400390625, "learning_rate": 1.2933005179189895e-06, "loss": 0.0422, "num_tokens": 60281122.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 106.265625, "completions/mean_terminated_length": 106.265625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.265610721900701, "grad_norm": 1.8834081888198853, "kl": 1.7158203125, "learning_rate": 1.2891160100527222e-06, "loss": 0.0989, "num_tokens": 60294883.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 96.96875, "completions/mean_terminated_length": 96.96875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.266829119707585, "grad_norm": 1.8734256029129028, "kl": 3.2919921875, "learning_rate": 1.2849378162539027e-06, "loss": 0.2021, "num_tokens": 60307833.0, "reward": 1.8515625, "reward_std": 0.4198446571826935, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.268047517514469, "grad_norm": 1.275524377822876, "kl": 2.38916015625, "learning_rate": 1.2807659395510863e-06, "loss": 0.0395, "num_tokens": 60321121.0, "reward": 1.87109375, "reward_std": 0.3307953476905823, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 97.046875, "completions/mean_terminated_length": 97.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.269265915321353, "grad_norm": 1.2135107517242432, "kl": 1.22412109375, "learning_rate": 1.2766003829682504e-06, "loss": 0.0679, "num_tokens": 60334468.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 107.046875, "completions/mean_terminated_length": 107.046875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.270484313128236, "grad_norm": 0.8563994765281677, "kl": 0.91259765625, "learning_rate": 1.2724411495247913e-06, "loss": -0.0166, "num_tokens": 60349135.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 93.515625, "completions/mean_terminated_length": 93.515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.27170271093512, "grad_norm": 0.884811282157898, "kl": 0.7158203125, "learning_rate": 1.268288242235518e-06, "loss": 0.0052, "num_tokens": 60361920.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 93.453125, "completions/mean_terminated_length": 93.453125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.272921108742004, "grad_norm": 0.35663047432899475, "kl": 0.4033203125, "learning_rate": 1.2641416641106607e-06, "loss": 0.0152, "num_tokens": 60374813.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 93.046875, "completions/mean_terminated_length": 93.046875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.274139506548888, "grad_norm": 0.9730705618858337, "kl": 1.3623046875, "learning_rate": 1.2600014181558562e-06, "loss": 0.0733, "num_tokens": 60387576.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 100.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.275357904355772, "grad_norm": 0.7491042613983154, "kl": 0.83642578125, "learning_rate": 1.2558675073721572e-06, "loss": 0.0397, "num_tokens": 60400984.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 94.375, "completions/mean_terminated_length": 94.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.276576302162656, "grad_norm": 0.8441370725631714, "kl": 1.2353515625, "learning_rate": 1.2517399347560165e-06, "loss": 0.0135, "num_tokens": 60413768.0, "reward": 1.93359375, "reward_std": 0.14150382578372955, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 94.390625, "completions/mean_terminated_length": 94.390625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 4.27779469996954, "grad_norm": 0.8819648623466492, "kl": 0.7939453125, "learning_rate": 1.2476187032992992e-06, "loss": -0.0179, "num_tokens": 60426553.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 106.875, "completions/mean_terminated_length": 106.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.279013097776424, "grad_norm": 1.2419517040252686, "kl": 1.3486328125, "learning_rate": 1.2435038159892754e-06, "loss": 0.0177, "num_tokens": 60440609.0, "reward": 1.8984375, "reward_std": 0.24094071984291077, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16796371340751648, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 98.359375, "completions/mean_terminated_length": 98.359375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.280231495583308, "grad_norm": 0.7663846611976624, "kl": 1.818359375, "learning_rate": 1.2393952758086059e-06, "loss": 0.037, "num_tokens": 60453968.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 106.015625, "completions/mean_terminated_length": 106.015625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.281449893390192, "grad_norm": 3.7500100135803223, "kl": 0.77197265625, "learning_rate": 1.235293085735364e-06, "loss": 0.0037, "num_tokens": 60467833.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 96.296875, "completions/mean_terminated_length": 96.296875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.2826682911970755, "grad_norm": 0.21382887661457062, "kl": 0.419921875, "learning_rate": 1.2311972487430113e-06, "loss": 0.0168, "num_tokens": 60481124.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 105.6875, "completions/mean_terminated_length": 105.6875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.2838866890039595, "grad_norm": 0.4898124635219574, "kl": 0.3935546875, "learning_rate": 1.2271077678004084e-06, "loss": 0.0162, "num_tokens": 60495448.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 104.734375, "completions/mean_terminated_length": 104.734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.285105086810844, "grad_norm": 0.3785748779773712, "kl": 0.6484375, "learning_rate": 1.2230246458718098e-06, "loss": 0.0087, "num_tokens": 60510055.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 87.1875, "completions/mean_terminated_length": 87.1875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.286323484617728, "grad_norm": 1.2396132946014404, "kl": 2.04931640625, "learning_rate": 1.2189478859168546e-06, "loss": 0.0367, "num_tokens": 60522459.0, "reward": 1.87890625, "reward_std": 0.21275897324085236, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.23034733533859253, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 99.40625, "completions/mean_terminated_length": 99.40625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.287541882424612, "grad_norm": 2.033158302307129, "kl": 5.35791015625, "learning_rate": 1.2148774908905782e-06, "loss": 0.355, "num_tokens": 60535957.0, "reward": 1.86328125, "reward_std": 0.34854328632354736, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11016931384801865, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 93.0, "completions/mean_terminated_length": 93.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.288760280231496, "grad_norm": 3.3621108531951904, "kl": 1.443359375, "learning_rate": 1.2108134637433944e-06, "loss": 0.0868, "num_tokens": 60548565.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 90.734375, "completions/mean_terminated_length": 90.734375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.28997867803838, "grad_norm": 0.983676552772522, "kl": 1.904296875, "learning_rate": 1.2067558074211072e-06, "loss": 0.1172, "num_tokens": 60560812.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 98.953125, "completions/mean_terminated_length": 98.953125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.291197075845264, "grad_norm": 0.790442943572998, "kl": 1.1083984375, "learning_rate": 1.2027045248649016e-06, "loss": -0.0263, "num_tokens": 60574817.0, "reward": 1.90625, "reward_std": 0.2041158676147461, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 107.328125, "completions/mean_terminated_length": 107.328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.292415473652148, "grad_norm": 0.8159535527229309, "kl": 1.52734375, "learning_rate": 1.198659619011343e-06, "loss": 0.0422, "num_tokens": 60589222.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 110.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 4.293633871459031, "grad_norm": 1.4664827585220337, "kl": 3.76953125, "learning_rate": 1.1946210927923729e-06, "loss": 0.263, "num_tokens": 60603806.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 91.703125, "completions/mean_terminated_length": 91.703125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.294852269265915, "grad_norm": 0.6504628658294678, "kl": 0.7041015625, "learning_rate": 1.1905889491353073e-06, "loss": 0.0246, "num_tokens": 60616491.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 94.78125, "completions/mean_terminated_length": 94.78125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 4.296070667072799, "grad_norm": 0.771757960319519, "kl": 0.7421875, "learning_rate": 1.1865631909628407e-06, "loss": 0.0039, "num_tokens": 60629381.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 82.78125, "completions/mean_terminated_length": 82.78125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.297289064879683, "grad_norm": 1.371614694595337, "kl": 1.44921875, "learning_rate": 1.1825438211930328e-06, "loss": 0.0892, "num_tokens": 60641487.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 85.71875, "completions/mean_terminated_length": 85.71875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.298507462686567, "grad_norm": 1.2174079418182373, "kl": 2.74853515625, "learning_rate": 1.1785308427393182e-06, "loss": 0.1492, "num_tokens": 60653485.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 115.96875, "completions/mean_terminated_length": 115.96875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.299725860493451, "grad_norm": 0.8412409424781799, "kl": 1.4404296875, "learning_rate": 1.1745242585104954e-06, "loss": 0.0376, "num_tokens": 60668771.0, "reward": 1.92578125, "reward_std": 0.16838817298412323, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 86.828125, "completions/mean_terminated_length": 86.828125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.300944258300335, "grad_norm": 2.170442581176758, "kl": 1.1376953125, "learning_rate": 1.1705240714107301e-06, "loss": 0.0148, "num_tokens": 60680688.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 98.984375, "completions/mean_terminated_length": 98.984375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.302162656107219, "grad_norm": 1.0832470655441284, "kl": 1.6728515625, "learning_rate": 1.1665302843395521e-06, "loss": 0.0518, "num_tokens": 60694287.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.303381053914103, "grad_norm": 5.314535140991211, "kl": 2.15673828125, "learning_rate": 1.1625429001918465e-06, "loss": 0.0662, "num_tokens": 60711871.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 102.046875, "completions/mean_terminated_length": 102.046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.304599451720987, "grad_norm": 0.557988703250885, "kl": 0.82177734375, "learning_rate": 1.1585619218578625e-06, "loss": 0.0271, "num_tokens": 60725730.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 99.109375, "completions/mean_terminated_length": 99.109375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.3058178495278705, "grad_norm": 0.7533594369888306, "kl": 0.94287109375, "learning_rate": 1.1545873522232055e-06, "loss": 0.0419, "num_tokens": 60739385.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 99.828125, "completions/mean_terminated_length": 99.828125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.3070362473347545, "grad_norm": 0.6365646123886108, "kl": 0.36767578125, "learning_rate": 1.1506191941688361e-06, "loss": 0.0218, "num_tokens": 60753094.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 99.40625, "completions/mean_terminated_length": 99.40625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.308254645141639, "grad_norm": 0.7991878390312195, "kl": 0.58544921875, "learning_rate": 1.1466574505710603e-06, "loss": 0.0205, "num_tokens": 60766728.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 97.640625, "completions/mean_terminated_length": 97.640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.309473042948523, "grad_norm": 1.6051174402236938, "kl": 1.58056640625, "learning_rate": 1.1427021243015469e-06, "loss": 0.1414, "num_tokens": 60780009.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 86.15625, "completions/mean_terminated_length": 86.15625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.310691440755407, "grad_norm": 1.2005785703659058, "kl": 2.1064453125, "learning_rate": 1.138753218227302e-06, "loss": 0.0984, "num_tokens": 60792083.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 86.671875, "completions/mean_terminated_length": 86.671875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.311909838562291, "grad_norm": 1.4421600103378296, "kl": 1.47265625, "learning_rate": 1.134810735210683e-06, "loss": 0.0671, "num_tokens": 60804182.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 89.59375, "completions/mean_terminated_length": 89.59375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.313128236369175, "grad_norm": 3.342529296875, "kl": 3.21044921875, "learning_rate": 1.1308746781093927e-06, "loss": 0.1307, "num_tokens": 60816700.0, "reward": 1.890625, "reward_std": 0.30935919284820557, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.19352105259895325, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.314346634176059, "grad_norm": 0.2542131841182709, "kl": 0.388671875, "learning_rate": 1.1269450497764734e-06, "loss": 0.0155, "num_tokens": 60832140.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 87.0, "completions/mean_terminated_length": 87.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.315565031982943, "grad_norm": 0.8223565220832825, "kl": 0.66943359375, "learning_rate": 1.1230218530603088e-06, "loss": 0.017, "num_tokens": 60844108.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 102.625, "completions/mean_terminated_length": 102.625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.316783429789826, "grad_norm": 1.0699065923690796, "kl": 2.28466796875, "learning_rate": 1.1191050908046242e-06, "loss": 0.1067, "num_tokens": 60858148.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 97.28125, "completions/mean_terminated_length": 97.28125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.31800182759671, "grad_norm": 1.055005669593811, "kl": 1.5205078125, "learning_rate": 1.115194765848473e-06, "loss": 0.0569, "num_tokens": 60871342.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 89.359375, "completions/mean_terminated_length": 89.359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.319220225403594, "grad_norm": 0.14931435883045197, "kl": 0.3759765625, "learning_rate": 1.1112908810262446e-06, "loss": 0.015, "num_tokens": 60883797.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 78.875, "completions/mean_terminated_length": 78.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.320438623210478, "grad_norm": 0.921548068523407, "kl": 0.9931640625, "learning_rate": 1.1073934391676666e-06, "loss": -0.0082, "num_tokens": 60895613.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 97.578125, "completions/mean_terminated_length": 97.578125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 4.321657021017362, "grad_norm": 0.8994899988174438, "kl": 1.46240234375, "learning_rate": 1.1035024430977903e-06, "loss": 0.0358, "num_tokens": 60908746.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 99.296875, "completions/mean_terminated_length": 99.296875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.322875418824246, "grad_norm": 0.5713149905204773, "kl": 1.00732421875, "learning_rate": 1.0996178956369974e-06, "loss": 0.0339, "num_tokens": 60922101.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 87.71875, "completions/mean_terminated_length": 87.71875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.32409381663113, "grad_norm": 0.19258014857769012, "kl": 0.42822265625, "learning_rate": 1.0957397996009934e-06, "loss": 0.0171, "num_tokens": 60934291.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.325312214438014, "grad_norm": 0.590592086315155, "kl": 0.6748046875, "learning_rate": 1.0918681578008128e-06, "loss": 0.0299, "num_tokens": 60947027.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 90.765625, "completions/mean_terminated_length": 90.765625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.326530612244898, "grad_norm": 0.5634415745735168, "kl": 1.06787109375, "learning_rate": 1.0880029730428021e-06, "loss": 0.0264, "num_tokens": 60959548.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 88.171875, "completions/mean_terminated_length": 88.171875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.327749010051782, "grad_norm": 1.465061068534851, "kl": 1.6240234375, "learning_rate": 1.084144248128637e-06, "loss": 0.068, "num_tokens": 60971655.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 93.578125, "completions/mean_terminated_length": 93.578125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.3289674078586655, "grad_norm": 2.171049118041992, "kl": 1.49365234375, "learning_rate": 1.0802919858553062e-06, "loss": 0.026, "num_tokens": 60984020.0, "reward": 1.9140625, "reward_std": 0.19101837277412415, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 88.515625, "completions/mean_terminated_length": 88.515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.33018580566555, "grad_norm": 1.30521821975708, "kl": 2.78076171875, "learning_rate": 1.0764461890151112e-06, "loss": 0.1041, "num_tokens": 60996373.0, "reward": 1.88671875, "reward_std": 0.32040777802467346, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 84.578125, "completions/mean_terminated_length": 84.578125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.331404203472434, "grad_norm": 0.6208963990211487, "kl": 0.4267578125, "learning_rate": 1.072606860395674e-06, "loss": 0.0029, "num_tokens": 61008802.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 88.71875, "completions/mean_terminated_length": 88.71875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.332622601279318, "grad_norm": 0.7330784201622009, "kl": 1.166015625, "learning_rate": 1.0687740027799255e-06, "loss": 0.0563, "num_tokens": 61021120.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 112.984375, "completions/mean_terminated_length": 112.984375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.333840999086202, "grad_norm": 0.3700559437274933, "kl": 0.40625, "learning_rate": 1.0649476189461005e-06, "loss": -0.0097, "num_tokens": 61035927.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 101.53125, "completions/mean_terminated_length": 101.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.335059396893086, "grad_norm": 0.5084562301635742, "kl": 0.54541015625, "learning_rate": 1.0611277116677488e-06, "loss": 0.0129, "num_tokens": 61049833.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 83.71875, "completions/mean_terminated_length": 83.71875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.33627779469997, "grad_norm": 2.3896989822387695, "kl": 1.5595703125, "learning_rate": 1.057314283713723e-06, "loss": 0.0714, "num_tokens": 61061911.0, "reward": 1.94921875, "reward_std": 0.13169018924236298, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 100.6875, "completions/mean_terminated_length": 100.6875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.337496192506854, "grad_norm": 0.19274786114692688, "kl": 0.376953125, "learning_rate": 1.0535073378481785e-06, "loss": 0.0151, "num_tokens": 61075299.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 97.375, "completions/mean_terminated_length": 97.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.338714590313738, "grad_norm": 0.6605406999588013, "kl": 0.8232421875, "learning_rate": 1.0497068768305718e-06, "loss": -0.0003, "num_tokens": 61088403.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 113.8125, "completions/mean_terminated_length": 113.8125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.339932988120621, "grad_norm": 1.8487907648086548, "kl": 1.66796875, "learning_rate": 1.0459129034156646e-06, "loss": 0.0383, "num_tokens": 61102799.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 93.015625, "completions/mean_terminated_length": 93.015625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.341151385927505, "grad_norm": 0.9822424054145813, "kl": 1.4736328125, "learning_rate": 1.0421254203535058e-06, "loss": 0.034, "num_tokens": 61115776.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.342369783734389, "grad_norm": 1.3448255062103271, "kl": 2.2529296875, "learning_rate": 1.0383444303894453e-06, "loss": 0.1095, "num_tokens": 61130824.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 73.203125, "completions/mean_terminated_length": 73.203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.343588181541273, "grad_norm": 1.4153398275375366, "kl": 1.58642578125, "learning_rate": 1.0345699362641271e-06, "loss": 0.0494, "num_tokens": 61141669.0, "reward": 1.9453125, "reward_std": 0.12087303400039673, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 97.375, "completions/mean_terminated_length": 97.375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.344806579348157, "grad_norm": 0.6072065234184265, "kl": 1.0224609375, "learning_rate": 1.0308019407134872e-06, "loss": -0.0195, "num_tokens": 61154917.0, "reward": 1.90625, "reward_std": 0.2041158676147461, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 96.671875, "completions/mean_terminated_length": 96.671875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.346024977155041, "grad_norm": 0.9194979071617126, "kl": 1.75390625, "learning_rate": 1.02704044646875e-06, "loss": 0.0444, "num_tokens": 61167840.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 90.984375, "completions/mean_terminated_length": 90.984375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.347243374961925, "grad_norm": 1.2006375789642334, "kl": 1.4384765625, "learning_rate": 1.0232854562564276e-06, "loss": 0.064, "num_tokens": 61180247.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 100.671875, "completions/mean_terminated_length": 100.671875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.348461772768809, "grad_norm": 0.5918007493019104, "kl": 0.7265625, "learning_rate": 1.019536972798315e-06, "loss": 0.0099, "num_tokens": 61193962.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 102.671875, "completions/mean_terminated_length": 102.671875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.349680170575693, "grad_norm": 0.7196965217590332, "kl": 1.52197265625, "learning_rate": 1.0157949988114969e-06, "loss": 0.0726, "num_tokens": 61207861.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 107.890625, "completions/mean_terminated_length": 107.890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.350898568382577, "grad_norm": 0.17505121231079102, "kl": 0.42236328125, "learning_rate": 1.012059537008332e-06, "loss": 0.0169, "num_tokens": 61222102.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 103.75, "completions/mean_terminated_length": 103.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.3521169661894605, "grad_norm": 0.18381181359291077, "kl": 0.427734375, "learning_rate": 1.0083305900964634e-06, "loss": 0.0171, "num_tokens": 61235806.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 92.671875, "completions/mean_terminated_length": 92.671875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 4.353335363996345, "grad_norm": 1.6901808977127075, "kl": 2.93798828125, "learning_rate": 1.0046081607788127e-06, "loss": 0.1329, "num_tokens": 61248377.0, "reward": 1.890625, "reward_std": 0.26782506704330444, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 104.953125, "completions/mean_terminated_length": 104.953125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.354553761803229, "grad_norm": 1.312945008277893, "kl": 1.80029296875, "learning_rate": 1.0008922517535747e-06, "loss": 0.0616, "num_tokens": 61262390.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 110.5, "completions/mean_terminated_length": 110.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.355772159610113, "grad_norm": 1.1006814241409302, "kl": 2.2705078125, "learning_rate": 9.971828657142202e-07, "loss": 0.0924, "num_tokens": 61277134.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 107.265625, "completions/mean_terminated_length": 107.265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.356990557416997, "grad_norm": 1.8206812143325806, "kl": 3.84521484375, "learning_rate": 9.934800053494886e-07, "loss": 0.1629, "num_tokens": 61291383.0, "reward": 1.8515625, "reward_std": 0.30674588680267334, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2498759627342224, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 89.078125, "completions/mean_terminated_length": 89.078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.358208955223881, "grad_norm": 0.8536443710327148, "kl": 1.1279296875, "learning_rate": 9.897836733433918e-07, "loss": 0.0275, "num_tokens": 61303436.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 90.734375, "completions/mean_terminated_length": 90.734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.359427353030765, "grad_norm": 0.4734104871749878, "kl": 0.62353515625, "learning_rate": 9.860938723752078e-07, "loss": 0.019, "num_tokens": 61316083.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 91.15625, "completions/mean_terminated_length": 91.15625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.360645750837649, "grad_norm": 0.32597312331199646, "kl": 0.38818359375, "learning_rate": 9.82410605119486e-07, "loss": 0.0155, "num_tokens": 61328661.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 98.859375, "completions/mean_terminated_length": 98.859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.361864148644532, "grad_norm": 0.7613440752029419, "kl": 0.89453125, "learning_rate": 9.787338742460306e-07, "loss": 0.0382, "num_tokens": 61342156.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 120.40625, "completions/mean_terminated_length": 106.0634994506836, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.363082546451416, "grad_norm": 2.097011089324951, "kl": 7.3154296875, "learning_rate": 9.750636824199144e-07, "loss": 0.5453, "num_tokens": 61356766.0, "reward": 1.9375, "reward_std": 0.13865739107131958, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 94.1875, "completions/mean_terminated_length": 94.1875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.3643009442583, "grad_norm": 0.7616081237792969, "kl": 0.8837890625, "learning_rate": 9.714000323014705e-07, "loss": 0.0356, "num_tokens": 61369690.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 95.40625, "completions/mean_terminated_length": 95.40625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.365519342065184, "grad_norm": 0.45101696252822876, "kl": 0.7841796875, "learning_rate": 9.677429265462868e-07, "loss": 0.021, "num_tokens": 61382324.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 118.65625, "completions/mean_terminated_length": 118.65625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.366737739872068, "grad_norm": 1.3252956867218018, "kl": 3.67041015625, "learning_rate": 9.640923678052094e-07, "loss": 0.2464, "num_tokens": 61397414.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 108.484375, "completions/mean_terminated_length": 108.484375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.367956137678952, "grad_norm": 0.8531728982925415, "kl": 0.53466796875, "learning_rate": 9.604483587243386e-07, "loss": 0.027, "num_tokens": 61411693.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 92.078125, "completions/mean_terminated_length": 92.078125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.369174535485836, "grad_norm": 0.5085017681121826, "kl": 0.4228515625, "learning_rate": 9.568109019450278e-07, "loss": 0.0036, "num_tokens": 61424330.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 110.828125, "completions/mean_terminated_length": 110.828125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.37039293329272, "grad_norm": 0.6134472489356995, "kl": 0.81787109375, "learning_rate": 9.531800001038837e-07, "loss": 0.0427, "num_tokens": 61438911.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 112.734375, "completions/mean_terminated_length": 112.734375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.371611331099604, "grad_norm": 0.3735889792442322, "kl": 0.3603515625, "learning_rate": 9.495556558327546e-07, "loss": -0.0099, "num_tokens": 61454086.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 97.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.372829728906488, "grad_norm": 0.1479308307170868, "kl": 0.39404296875, "learning_rate": 9.459378717587386e-07, "loss": 0.0157, "num_tokens": 61467550.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 96.234375, "completions/mean_terminated_length": 96.234375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.3740481267133715, "grad_norm": 0.670125424861908, "kl": 1.23291015625, "learning_rate": 9.423266505041817e-07, "loss": 0.0811, "num_tokens": 61480413.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 101.203125, "completions/mean_terminated_length": 101.203125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.3752665245202556, "grad_norm": 0.1316385716199875, "kl": 0.38720703125, "learning_rate": 9.387219946866699e-07, "loss": 0.0155, "num_tokens": 61494154.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 101.828125, "completions/mean_terminated_length": 87.19048309326172, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.37648492232714, "grad_norm": 1.7856134176254272, "kl": 5.69677734375, "learning_rate": 9.35123906919031e-07, "loss": 0.4718, "num_tokens": 61506903.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 121.328125, "completions/mean_terminated_length": 121.328125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.377703320134024, "grad_norm": 1.8641419410705566, "kl": 2.32470703125, "learning_rate": 9.315323898093332e-07, "loss": 0.1038, "num_tokens": 61522484.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 98.515625, "completions/mean_terminated_length": 98.515625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.378921717940908, "grad_norm": 2.3247854709625244, "kl": 1.69873046875, "learning_rate": 9.279474459608806e-07, "loss": 0.0317, "num_tokens": 61535917.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 105.546875, "completions/mean_terminated_length": 105.546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.380140115747792, "grad_norm": 1.1071226596832275, "kl": 2.56494140625, "learning_rate": 9.243690779722114e-07, "loss": 0.1211, "num_tokens": 61550160.0, "reward": 1.91796875, "reward_std": 0.19821283221244812, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 92.40625, "completions/mean_terminated_length": 92.40625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.381358513554676, "grad_norm": 0.3192826211452484, "kl": 0.54736328125, "learning_rate": 9.207972884370986e-07, "loss": 0.0028, "num_tokens": 61562402.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 106.21875, "completions/mean_terminated_length": 106.21875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.38257691136156, "grad_norm": 1.1691882610321045, "kl": 2.37841796875, "learning_rate": 9.1723207994455e-07, "loss": 0.0777, "num_tokens": 61576296.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.19697457551956177, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 104.234375, "completions/mean_terminated_length": 104.234375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.383795309168444, "grad_norm": 0.5837779641151428, "kl": 1.23583984375, "learning_rate": 9.136734550787951e-07, "loss": 0.0133, "num_tokens": 61590247.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 105.375, "completions/mean_terminated_length": 105.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.385013706975327, "grad_norm": 0.4644639790058136, "kl": 0.4931640625, "learning_rate": 9.101214164192995e-07, "loss": 0.018, "num_tokens": 61604239.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 105.8125, "completions/mean_terminated_length": 105.8125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.386232104782211, "grad_norm": 1.0925476551055908, "kl": 1.50439453125, "learning_rate": 9.065759665407514e-07, "loss": -0.0103, "num_tokens": 61618443.0, "reward": 1.88671875, "reward_std": 0.27887362241744995, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 97.046875, "completions/mean_terminated_length": 97.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.387450502589095, "grad_norm": 11.542875289916992, "kl": 3.9970703125, "learning_rate": 9.030371080130651e-07, "loss": 0.2351, "num_tokens": 61631190.0, "reward": 1.91796875, "reward_std": 0.20385757088661194, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1597815304994583, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 109.90625, "completions/mean_terminated_length": 109.90625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.388668900395979, "grad_norm": 0.6588073968887329, "kl": 1.39111328125, "learning_rate": 8.995048434013709e-07, "loss": 0.0608, "num_tokens": 61645592.0, "reward": 1.93359375, "reward_std": 0.15401867032051086, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 98.671875, "completions/mean_terminated_length": 98.671875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.389887298202863, "grad_norm": 1.2770130634307861, "kl": 0.9287109375, "learning_rate": 8.959791752660263e-07, "loss": 0.0458, "num_tokens": 61658499.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 93.265625, "completions/mean_terminated_length": 93.265625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.391105696009747, "grad_norm": 0.7930905818939209, "kl": 1.30859375, "learning_rate": 8.924601061626049e-07, "loss": 0.0012, "num_tokens": 61671252.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 103.625, "completions/mean_terminated_length": 103.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.392324093816631, "grad_norm": 0.42459940910339355, "kl": 0.66064453125, "learning_rate": 8.889476386418983e-07, "loss": 0.0247, "num_tokens": 61684684.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 94.703125, "completions/mean_terminated_length": 94.703125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.393542491623515, "grad_norm": 0.8010783195495605, "kl": 1.0869140625, "learning_rate": 8.854417752499112e-07, "loss": 0.0424, "num_tokens": 61697377.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 103.921875, "completions/mean_terminated_length": 103.921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.394760889430399, "grad_norm": 0.7114661335945129, "kl": 0.94091796875, "learning_rate": 8.819425185278629e-07, "loss": 0.0225, "num_tokens": 61711348.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 102.859375, "completions/mean_terminated_length": 102.859375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.395979287237283, "grad_norm": 0.14820027351379395, "kl": 0.36669921875, "learning_rate": 8.784498710121792e-07, "loss": 0.0146, "num_tokens": 61724219.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 94.640625, "completions/mean_terminated_length": 94.640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.3971976850441665, "grad_norm": 1.744950771331787, "kl": 2.0205078125, "learning_rate": 8.749638352345002e-07, "loss": 0.1392, "num_tokens": 61736372.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 107.921875, "completions/mean_terminated_length": 107.921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.398416082851051, "grad_norm": 0.6612001657485962, "kl": 0.53369140625, "learning_rate": 8.714844137216749e-07, "loss": 0.0198, "num_tokens": 61750783.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.399634480657935, "grad_norm": 0.9245343804359436, "kl": 2.5810546875, "learning_rate": 8.680116089957524e-07, "loss": 0.1513, "num_tokens": 61764183.0, "reward": 1.9453125, "reward_std": 0.12087303400039673, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 109.734375, "completions/mean_terminated_length": 109.734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.400852878464819, "grad_norm": 1.4474225044250488, "kl": 1.96533203125, "learning_rate": 8.645454235739903e-07, "loss": 0.0836, "num_tokens": 61778830.0, "reward": 1.91015625, "reward_std": 0.21596075594425201, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 94.765625, "completions/mean_terminated_length": 94.765625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.402071276271703, "grad_norm": 0.896158754825592, "kl": 1.3408203125, "learning_rate": 8.610858599688477e-07, "loss": 0.0856, "num_tokens": 61790871.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 105.875, "completions/mean_terminated_length": 105.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.403289674078587, "grad_norm": 0.9617887735366821, "kl": 1.90283203125, "learning_rate": 8.576329206879785e-07, "loss": 0.0983, "num_tokens": 61804711.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 106.28125, "completions/mean_terminated_length": 106.28125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.404508071885471, "grad_norm": 0.1002357006072998, "kl": 0.353515625, "learning_rate": 8.541866082342432e-07, "loss": 0.0141, "num_tokens": 61818601.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 138.234375, "completions/mean_terminated_length": 138.234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.405726469692355, "grad_norm": 1.2908902168273926, "kl": 2.65673828125, "learning_rate": 8.50746925105691e-07, "loss": 0.1497, "num_tokens": 61836232.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 101.140625, "completions/mean_terminated_length": 101.140625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.406944867499239, "grad_norm": 0.6903200149536133, "kl": 1.30712890625, "learning_rate": 8.473138737955722e-07, "loss": 0.0426, "num_tokens": 61849529.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 118.609375, "completions/mean_terminated_length": 118.609375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.408163265306122, "grad_norm": 0.3232486844062805, "kl": 0.5224609375, "learning_rate": 8.438874567923261e-07, "loss": 0.0189, "num_tokens": 61864800.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 94.734375, "completions/mean_terminated_length": 94.734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.409381663113006, "grad_norm": 1.2519081830978394, "kl": 1.47509765625, "learning_rate": 8.404676765795861e-07, "loss": 0.0776, "num_tokens": 61877399.0, "reward": 1.94921875, "reward_std": 0.14363105595111847, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 93.921875, "completions/mean_terminated_length": 93.921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.41060006091989, "grad_norm": 0.5538948178291321, "kl": 0.49658203125, "learning_rate": 8.370545356361714e-07, "loss": 0.0198, "num_tokens": 61889826.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 105.75, "completions/mean_terminated_length": 105.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.411818458726774, "grad_norm": 0.47761210799217224, "kl": 0.58056640625, "learning_rate": 8.33648036436091e-07, "loss": 0.0174, "num_tokens": 61903322.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 88.90625, "completions/mean_terminated_length": 88.90625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.413036856533658, "grad_norm": 0.13218030333518982, "kl": 0.44384765625, "learning_rate": 8.302481814485397e-07, "loss": 0.0177, "num_tokens": 61915372.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 89.9375, "completions/mean_terminated_length": 89.9375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.414255254340542, "grad_norm": 0.9763673543930054, "kl": 1.27294921875, "learning_rate": 8.268549731378972e-07, "loss": 0.1116, "num_tokens": 61927304.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 110.46875, "completions/mean_terminated_length": 110.46875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.415473652147426, "grad_norm": 0.9820052981376648, "kl": 2.45068359375, "learning_rate": 8.234684139637205e-07, "loss": 0.1647, "num_tokens": 61941766.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 100.109375, "completions/mean_terminated_length": 100.109375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.41669204995431, "grad_norm": 0.20761704444885254, "kl": 0.38623046875, "learning_rate": 8.200885063807529e-07, "loss": 0.0155, "num_tokens": 61955221.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 94.625, "completions/mean_terminated_length": 94.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.417910447761194, "grad_norm": 0.8021084666252136, "kl": 1.4521484375, "learning_rate": 8.167152528389155e-07, "loss": 0.0719, "num_tokens": 61968085.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 125.65625, "completions/mean_terminated_length": 111.39683532714844, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.419128845568078, "grad_norm": 2.1975033283233643, "kl": 0.49951171875, "learning_rate": 8.133486557833015e-07, "loss": 0.1788, "num_tokens": 61983031.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 99.234375, "completions/mean_terminated_length": 99.234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.4203472433749615, "grad_norm": 0.5400899052619934, "kl": 0.71875, "learning_rate": 8.099887176541832e-07, "loss": 0.0237, "num_tokens": 61996014.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 113.9375, "completions/mean_terminated_length": 113.9375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.421565641181846, "grad_norm": 2.0928425788879395, "kl": 7.2021484375, "learning_rate": 8.066354408870047e-07, "loss": 0.5119, "num_tokens": 62010706.0, "reward": 1.89453125, "reward_std": 0.29831066727638245, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.19697457551956177, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 105.96875, "completions/mean_terminated_length": 105.96875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.42278403898873, "grad_norm": 0.7892974019050598, "kl": 1.85546875, "learning_rate": 8.032888279123829e-07, "loss": 0.1329, "num_tokens": 62024168.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 102.71875, "completions/mean_terminated_length": 102.71875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.424002436795614, "grad_norm": 0.18530409038066864, "kl": 0.3798828125, "learning_rate": 7.999488811561051e-07, "loss": 0.0152, "num_tokens": 62037414.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 120.734375, "completions/mean_terminated_length": 120.734375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.425220834602498, "grad_norm": 1.062575101852417, "kl": 1.509765625, "learning_rate": 7.966156030391247e-07, "loss": 0.0427, "num_tokens": 62052821.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 105.625, "completions/mean_terminated_length": 105.625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.426439232409382, "grad_norm": 1.237923502922058, "kl": 2.3701171875, "learning_rate": 7.932889959775613e-07, "loss": 0.1187, "num_tokens": 62066837.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 104.34375, "completions/mean_terminated_length": 104.34375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.427657630216266, "grad_norm": 0.8427834510803223, "kl": 1.02880859375, "learning_rate": 7.899690623826983e-07, "loss": 0.051, "num_tokens": 62080379.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 94.453125, "completions/mean_terminated_length": 94.453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.42887602802315, "grad_norm": 4.039230823516846, "kl": 1.57470703125, "learning_rate": 7.866558046609851e-07, "loss": 0.0656, "num_tokens": 62093056.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 91.078125, "completions/mean_terminated_length": 91.078125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.430094425830034, "grad_norm": 0.21357256174087524, "kl": 0.416015625, "learning_rate": 7.833492252140284e-07, "loss": 0.0167, "num_tokens": 62105301.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 116.90625, "completions/mean_terminated_length": 116.90625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.431312823636917, "grad_norm": 1.2168408632278442, "kl": 1.435546875, "learning_rate": 7.800493264385967e-07, "loss": -0.0193, "num_tokens": 62120343.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 85.9375, "completions/mean_terminated_length": 85.9375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.432531221443801, "grad_norm": 1.1222944259643555, "kl": 1.07275390625, "learning_rate": 7.767561107266175e-07, "loss": -0.0143, "num_tokens": 62131939.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 100.234375, "completions/mean_terminated_length": 100.234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.433749619250685, "grad_norm": 1.0536308288574219, "kl": 0.81103515625, "learning_rate": 7.734695804651693e-07, "loss": -0.0167, "num_tokens": 62144794.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 102.1875, "completions/mean_terminated_length": 102.1875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.434968017057569, "grad_norm": 0.5614926815032959, "kl": 0.9140625, "learning_rate": 7.701897380364865e-07, "loss": 0.0318, "num_tokens": 62158670.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.436186414864453, "grad_norm": 3.9713778495788574, "kl": 5.47607421875, "learning_rate": 7.669165858179594e-07, "loss": 0.3518, "num_tokens": 62172386.0, "reward": 1.89453125, "reward_std": 0.29831066727638245, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 95.140625, "completions/mean_terminated_length": 95.140625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.437404812671337, "grad_norm": 0.9462157487869263, "kl": 1.3076171875, "learning_rate": 7.636501261821239e-07, "loss": 0.0406, "num_tokens": 62184771.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 108.69841766357422, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.438623210478221, "grad_norm": 1.836883544921875, "kl": 4.53271484375, "learning_rate": 7.603903614966668e-07, "loss": 0.4701, "num_tokens": 62199219.0, "reward": 1.87890625, "reward_std": 0.24209867417812347, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16993631422519684, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 103.953125, "completions/mean_terminated_length": 103.953125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.439841608285105, "grad_norm": 1.5208356380462646, "kl": 2.451171875, "learning_rate": 7.571372941244237e-07, "loss": 0.1211, "num_tokens": 62212400.0, "reward": 1.8984375, "reward_std": 0.24094071984291077, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 118.171875, "completions/mean_terminated_length": 118.171875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.441060006091989, "grad_norm": 0.5629410743713379, "kl": 0.63525390625, "learning_rate": 7.538909264233751e-07, "loss": -0.0022, "num_tokens": 62227315.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 127.328125, "completions/mean_terminated_length": 127.328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 4.442278403898873, "grad_norm": 3.786069631576538, "kl": 6.333984375, "learning_rate": 7.50651260746641e-07, "loss": 0.3055, "num_tokens": 62243112.0, "reward": 1.8359375, "reward_std": 0.3308935761451721, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 95.578125, "completions/mean_terminated_length": 95.578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.443496801705757, "grad_norm": 0.14891505241394043, "kl": 0.39794921875, "learning_rate": 7.474182994424883e-07, "loss": 0.0159, "num_tokens": 62256085.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.444715199512641, "grad_norm": 1.3261059522628784, "kl": 3.6162109375, "learning_rate": 7.441920448543238e-07, "loss": 0.2063, "num_tokens": 62271373.0, "reward": 1.921875, "reward_std": 0.18281513452529907, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 127.21875, "completions/mean_terminated_length": 127.21875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.445933597319525, "grad_norm": 0.9470688104629517, "kl": 2.82373046875, "learning_rate": 7.409724993206901e-07, "loss": 0.1943, "num_tokens": 62286995.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 111.1875, "completions/mean_terminated_length": 111.1875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.447151995126409, "grad_norm": 1.3807945251464844, "kl": 2.1240234375, "learning_rate": 7.377596651752728e-07, "loss": 0.1151, "num_tokens": 62300927.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 99.265625, "completions/mean_terminated_length": 99.265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 4.448370392933293, "grad_norm": 1.2499046325683594, "kl": 2.35009765625, "learning_rate": 7.345535447468822e-07, "loss": 0.1394, "num_tokens": 62313936.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 88.75, "completions/mean_terminated_length": 88.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.449588790740177, "grad_norm": 1.2866004705429077, "kl": 3.2099609375, "learning_rate": 7.313541403594748e-07, "loss": 0.1517, "num_tokens": 62326168.0, "reward": 1.87890625, "reward_std": 0.2814556658267975, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 127.609375, "completions/mean_terminated_length": 127.609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.450807188547061, "grad_norm": 0.28866642713546753, "kl": 0.41552734375, "learning_rate": 7.281614543321269e-07, "loss": 0.0166, "num_tokens": 62341743.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 117.734375, "completions/mean_terminated_length": 117.734375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.452025586353945, "grad_norm": 1.5128835439682007, "kl": 2.24072265625, "learning_rate": 7.249754889790539e-07, "loss": 0.1153, "num_tokens": 62356910.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 102.203125, "completions/mean_terminated_length": 102.203125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.453243984160829, "grad_norm": 0.14461329579353333, "kl": 0.3974609375, "learning_rate": 7.217962466095962e-07, "loss": 0.0159, "num_tokens": 62369931.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.454462381967712, "grad_norm": 1.560357689857483, "kl": 1.818359375, "learning_rate": 7.186237295282217e-07, "loss": 0.1483, "num_tokens": 62386391.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 103.28125, "completions/mean_terminated_length": 103.28125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.455680779774596, "grad_norm": 0.9062807559967041, "kl": 1.83984375, "learning_rate": 7.154579400345241e-07, "loss": 0.0549, "num_tokens": 62400513.0, "reward": 1.9296875, "reward_std": 0.16262178122997284, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 104.234375, "completions/mean_terminated_length": 104.234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.45689917758148, "grad_norm": 0.7568832039833069, "kl": 2.11376953125, "learning_rate": 7.122988804232178e-07, "loss": 0.1281, "num_tokens": 62414168.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 111.734375, "completions/mean_terminated_length": 111.734375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.458117575388364, "grad_norm": 1.324247121810913, "kl": 3.728515625, "learning_rate": 7.091465529841434e-07, "loss": 0.2494, "num_tokens": 62428319.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 115.5625, "completions/mean_terminated_length": 115.5625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.459335973195248, "grad_norm": 0.8590952754020691, "kl": 1.18505859375, "learning_rate": 7.060009600022566e-07, "loss": 0.0469, "num_tokens": 62442803.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 96.734375, "completions/mean_terminated_length": 96.734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.460554371002132, "grad_norm": 1.3072298765182495, "kl": 0.490234375, "learning_rate": 7.028621037576355e-07, "loss": 0.0211, "num_tokens": 62455762.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 121.921875, "completions/mean_terminated_length": 121.921875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.461772768809016, "grad_norm": 0.8263190984725952, "kl": 0.91162109375, "learning_rate": 6.997299865254748e-07, "loss": 0.0363, "num_tokens": 62470925.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 111.578125, "completions/mean_terminated_length": 111.578125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.4629911666159, "grad_norm": 0.760124921798706, "kl": 1.3017578125, "learning_rate": 6.966046105760826e-07, "loss": 0.0932, "num_tokens": 62484890.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 110.015625, "completions/mean_terminated_length": 110.015625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.464209564422784, "grad_norm": 0.6108146905899048, "kl": 0.986328125, "learning_rate": 6.934859781748848e-07, "loss": 0.0402, "num_tokens": 62499235.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 105.421875, "completions/mean_terminated_length": 105.421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.465427962229668, "grad_norm": 1.0537117719650269, "kl": 1.8896484375, "learning_rate": 6.903740915824109e-07, "loss": 0.0373, "num_tokens": 62513326.0, "reward": 1.8984375, "reward_std": 0.2872621417045593, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 106.84375, "completions/mean_terminated_length": 106.84375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.466646360036552, "grad_norm": 1.1307824850082397, "kl": 2.248046875, "learning_rate": 6.872689530543087e-07, "loss": 0.0784, "num_tokens": 62527332.0, "reward": 1.8984375, "reward_std": 0.2352125495672226, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 96.453125, "completions/mean_terminated_length": 96.453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.467864757843436, "grad_norm": 0.952768862247467, "kl": 0.4365234375, "learning_rate": 6.841705648413322e-07, "loss": 0.0083, "num_tokens": 62539529.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 113.53125, "completions/mean_terminated_length": 113.53125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.46908315565032, "grad_norm": 1.0768061876296997, "kl": 2.06591796875, "learning_rate": 6.810789291893427e-07, "loss": 0.0896, "num_tokens": 62554491.0, "reward": 1.93359375, "reward_std": 0.13859297335147858, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 125.484375, "completions/mean_terminated_length": 125.484375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.470301553457204, "grad_norm": 0.14279665052890778, "kl": 0.38916015625, "learning_rate": 6.779940483393033e-07, "loss": 0.0155, "num_tokens": 62570170.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.471519951264088, "grad_norm": 2.691854953765869, "kl": 5.13134765625, "learning_rate": 6.749159245272874e-07, "loss": 0.3047, "num_tokens": 62584090.0, "reward": 1.953125, "reward_std": 0.0867956355214119, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 88.328125, "completions/mean_terminated_length": 88.328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.472738349070972, "grad_norm": 0.6873770952224731, "kl": 1.30126953125, "learning_rate": 6.718445599844637e-07, "loss": 0.0132, "num_tokens": 62595991.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 106.921875, "completions/mean_terminated_length": 106.921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.473956746877856, "grad_norm": 0.8289063572883606, "kl": 1.06787109375, "learning_rate": 6.687799569371079e-07, "loss": 0.0786, "num_tokens": 62609898.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.47517514468474, "grad_norm": 0.7533006072044373, "kl": 0.76611328125, "learning_rate": 6.657221176065887e-07, "loss": 0.0369, "num_tokens": 62623034.0, "reward": 1.99609375, "reward_std": 0.011048543266952038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 101.3125, "completions/mean_terminated_length": 101.3125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.476393542491624, "grad_norm": 0.6242942214012146, "kl": 0.7783203125, "learning_rate": 6.626710442093776e-07, "loss": 0.0381, "num_tokens": 62636606.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 106.828125, "completions/mean_terminated_length": 106.828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.477611940298507, "grad_norm": 0.45174115896224976, "kl": 0.6748046875, "learning_rate": 6.596267389570388e-07, "loss": 0.0085, "num_tokens": 62650451.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.478830338105391, "grad_norm": 0.5865659713745117, "kl": 0.6708984375, "learning_rate": 6.565892040562317e-07, "loss": 0.0312, "num_tokens": 62664625.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 104.234375, "completions/mean_terminated_length": 104.234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.480048735912275, "grad_norm": 1.9175195693969727, "kl": 1.8798828125, "learning_rate": 6.535584417087071e-07, "loss": 0.0695, "num_tokens": 62678496.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 106.9375, "completions/mean_terminated_length": 106.9375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.481267133719159, "grad_norm": 1.7107393741607666, "kl": 4.0986328125, "learning_rate": 6.505344541113046e-07, "loss": 0.1859, "num_tokens": 62692404.0, "reward": 1.8515625, "reward_std": 0.362932026386261, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.20152568817138672, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 112.515625, "completions/mean_terminated_length": 112.515625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.482485531526043, "grad_norm": 0.5715935826301575, "kl": 0.77294921875, "learning_rate": 6.475172434559573e-07, "loss": 0.0186, "num_tokens": 62706557.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 116.5625, "completions/mean_terminated_length": 116.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.483703929332927, "grad_norm": 0.8649886250495911, "kl": 1.57958984375, "learning_rate": 6.44506811929686e-07, "loss": 0.0717, "num_tokens": 62721377.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 114.859375, "completions/mean_terminated_length": 114.859375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.484922327139811, "grad_norm": 0.7131067514419556, "kl": 0.40283203125, "learning_rate": 6.415031617145951e-07, "loss": 0.0087, "num_tokens": 62735816.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 101.734375, "completions/mean_terminated_length": 101.734375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.486140724946695, "grad_norm": 0.5286831259727478, "kl": 0.56591796875, "learning_rate": 6.385062949878751e-07, "loss": 0.0174, "num_tokens": 62748903.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 101.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.487359122753579, "grad_norm": 1.126868486404419, "kl": 1.20654296875, "learning_rate": 6.355162139217996e-07, "loss": 0.0458, "num_tokens": 62762127.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 102.28125, "completions/mean_terminated_length": 102.28125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.488577520560463, "grad_norm": 0.7513738870620728, "kl": 0.53515625, "learning_rate": 6.325329206837217e-07, "loss": -0.0102, "num_tokens": 62775185.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 113.0625, "completions/mean_terminated_length": 113.0625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.489795918367347, "grad_norm": 1.1124948263168335, "kl": 1.79931640625, "learning_rate": 6.295564174360769e-07, "loss": 0.0782, "num_tokens": 62790045.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 4.491014316174231, "grad_norm": 0.8604798316955566, "kl": 1.31982421875, "learning_rate": 6.265867063363762e-07, "loss": 0.0237, "num_tokens": 62805397.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 116.8125, "completions/mean_terminated_length": 116.8125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.492232713981115, "grad_norm": 1.21648108959198, "kl": 1.52490234375, "learning_rate": 6.2362378953721e-07, "loss": 0.0052, "num_tokens": 62820065.0, "reward": 1.9140625, "reward_std": 0.24306796491146088, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 99.796875, "completions/mean_terminated_length": 99.796875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.493451111787999, "grad_norm": 0.45362839102745056, "kl": 0.6357421875, "learning_rate": 6.206676691862435e-07, "loss": -0.0002, "num_tokens": 62833252.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 115.828125, "completions/mean_terminated_length": 115.828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.494669509594883, "grad_norm": 0.16504167020320892, "kl": 0.44873046875, "learning_rate": 6.177183474262149e-07, "loss": 0.0179, "num_tokens": 62848313.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 109.203125, "completions/mean_terminated_length": 109.203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.495887907401767, "grad_norm": 1.3113983869552612, "kl": 0.83740234375, "learning_rate": 6.147758263949322e-07, "loss": 0.0138, "num_tokens": 62862374.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 121.109375, "completions/mean_terminated_length": 121.109375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.497106305208651, "grad_norm": 1.0096569061279297, "kl": 0.8720703125, "learning_rate": 6.118401082252779e-07, "loss": 0.0437, "num_tokens": 62877765.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.498324703015535, "grad_norm": 1.2917397022247314, "kl": 2.41259765625, "learning_rate": 6.089111950452009e-07, "loss": 0.1443, "num_tokens": 62893304.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09241779148578644, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 104.40625, "completions/mean_terminated_length": 104.40625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.499543100822419, "grad_norm": 0.21186842024326324, "kl": 0.3759765625, "learning_rate": 6.059890889777198e-07, "loss": 0.015, "num_tokens": 62906786.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 105.265625, "completions/mean_terminated_length": 105.265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.500761498629302, "grad_norm": 3.1128947734832764, "kl": 2.40478515625, "learning_rate": 6.030737921409169e-07, "loss": 0.0252, "num_tokens": 62920235.0, "reward": 1.875, "reward_std": 0.292504221200943, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.24397502839565277, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 100.34375, "completions/mean_terminated_length": 100.34375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.501979896436186, "grad_norm": 1.6817539930343628, "kl": 1.34765625, "learning_rate": 6.001653066479373e-07, "loss": 0.0227, "num_tokens": 62933337.0, "reward": 1.90625, "reward_std": 0.1694951355457306, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1298656165599823, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 100.203125, "completions/mean_terminated_length": 100.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.50319829424307, "grad_norm": 0.7507466673851013, "kl": 0.44677734375, "learning_rate": 5.972636346069949e-07, "loss": -0.0115, "num_tokens": 62946342.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 108.828125, "completions/mean_terminated_length": 108.828125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.504416692049954, "grad_norm": 0.5364843010902405, "kl": 0.40283203125, "learning_rate": 5.94368778121357e-07, "loss": 0.0123, "num_tokens": 62960411.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 124.28125, "completions/mean_terminated_length": 124.28125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 4.505635089856838, "grad_norm": 1.6590712070465088, "kl": 2.9951171875, "learning_rate": 5.914807392893573e-07, "loss": 0.1028, "num_tokens": 62975773.0, "reward": 1.88671875, "reward_std": 0.18463000655174255, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 102.015625, "completions/mean_terminated_length": 102.015625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.506853487663722, "grad_norm": 0.7333009243011475, "kl": 1.953125, "learning_rate": 5.885995202043848e-07, "loss": 0.1014, "num_tokens": 62988846.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.508071885470606, "grad_norm": 1.2034343481063843, "kl": 1.40478515625, "learning_rate": 5.857251229548844e-07, "loss": 0.0793, "num_tokens": 63002814.0, "reward": 1.93359375, "reward_std": 0.1878252476453781, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 90.578125, "completions/mean_terminated_length": 90.578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.50929028327749, "grad_norm": 1.1343767642974854, "kl": 1.35400390625, "learning_rate": 5.828575496243605e-07, "loss": 0.0466, "num_tokens": 63014939.0, "reward": 1.92578125, "reward_std": 0.16838817298412323, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 109.53125, "completions/mean_terminated_length": 109.53125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.510508681084374, "grad_norm": 1.286080002784729, "kl": 2.38037109375, "learning_rate": 5.799968022913638e-07, "loss": 0.1868, "num_tokens": 63028733.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 109.390625, "completions/mean_terminated_length": 109.390625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.5117270788912585, "grad_norm": 0.8885599970817566, "kl": 1.51171875, "learning_rate": 5.771428830295056e-07, "loss": 0.0342, "num_tokens": 63042606.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 104.03125, "completions/mean_terminated_length": 104.03125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.512945476698142, "grad_norm": 1.8223626613616943, "kl": 0.4560546875, "learning_rate": 5.742957939074412e-07, "loss": 0.0183, "num_tokens": 63056032.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 113.609375, "completions/mean_terminated_length": 113.609375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.514163874505026, "grad_norm": 1.0450562238693237, "kl": 2.009765625, "learning_rate": 5.714555369888774e-07, "loss": 0.1622, "num_tokens": 63070415.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 99.3125, "completions/mean_terminated_length": 99.3125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.51538227231191, "grad_norm": 2.002150774002075, "kl": 2.13134765625, "learning_rate": 5.686221143325698e-07, "loss": 0.1553, "num_tokens": 63083443.0, "reward": 1.91796875, "reward_std": 0.1938636749982834, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.08097480982542038, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 92.140625, "completions/mean_terminated_length": 92.140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.516600670118794, "grad_norm": 0.43970203399658203, "kl": 0.60400390625, "learning_rate": 5.6579552799232e-07, "loss": 0.0118, "num_tokens": 63095820.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 113.5625, "completions/mean_terminated_length": 113.5625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.517819067925678, "grad_norm": 1.0853028297424316, "kl": 2.8662109375, "learning_rate": 5.629757800169732e-07, "loss": 0.1593, "num_tokens": 63110232.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 108.296875, "completions/mean_terminated_length": 108.296875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.519037465732562, "grad_norm": 0.7876037359237671, "kl": 1.9267578125, "learning_rate": 5.601628724504182e-07, "loss": 0.1383, "num_tokens": 63124403.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 4.520255863539446, "grad_norm": 4.1348772048950195, "kl": 3.00537109375, "learning_rate": 5.573568073315838e-07, "loss": 0.1785, "num_tokens": 63140179.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 122.9375, "completions/mean_terminated_length": 122.9375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.52147426134633, "grad_norm": 1.523110270500183, "kl": 6.04541015625, "learning_rate": 5.545575866944441e-07, "loss": 0.4593, "num_tokens": 63154679.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 115.03125, "completions/mean_terminated_length": 115.03125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.522692659153213, "grad_norm": 1.021601676940918, "kl": 1.49560546875, "learning_rate": 5.517652125680084e-07, "loss": 0.0741, "num_tokens": 63169433.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 119.46875, "completions/mean_terminated_length": 119.46875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.523911056960097, "grad_norm": 0.8358948826789856, "kl": 1.21826171875, "learning_rate": 5.489796869763208e-07, "loss": 0.1248, "num_tokens": 63184679.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 117.84375, "completions/mean_terminated_length": 117.84375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.525129454766981, "grad_norm": 0.4640582501888275, "kl": 0.755859375, "learning_rate": 5.462010119384665e-07, "loss": 0.0248, "num_tokens": 63199821.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 106.0625, "completions/mean_terminated_length": 106.0625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.526347852573865, "grad_norm": 1.2764885425567627, "kl": 1.93505859375, "learning_rate": 5.434291894685628e-07, "loss": 0.0448, "num_tokens": 63213697.0, "reward": 1.890625, "reward_std": 0.19918900728225708, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1985812783241272, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 102.03125, "completions/mean_terminated_length": 102.03125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.527566250380749, "grad_norm": 0.8332855701446533, "kl": 0.76171875, "learning_rate": 5.406642215757585e-07, "loss": 0.0521, "num_tokens": 63226987.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 114.96875, "completions/mean_terminated_length": 114.96875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.528784648187633, "grad_norm": 0.3179704546928406, "kl": 0.52978515625, "learning_rate": 5.379061102642357e-07, "loss": -0.0001, "num_tokens": 63241809.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 124.65625, "completions/mean_terminated_length": 124.65625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.530003045994517, "grad_norm": 1.2214596271514893, "kl": 1.34716796875, "learning_rate": 5.351548575332056e-07, "loss": 0.0172, "num_tokens": 63257995.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 112.1875, "completions/mean_terminated_length": 112.1875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.531221443801401, "grad_norm": 0.9845108985900879, "kl": 1.275390625, "learning_rate": 5.324104653769102e-07, "loss": -0.0114, "num_tokens": 63272679.0, "reward": 1.90234375, "reward_std": 0.22989216446876526, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 122.828125, "completions/mean_terminated_length": 122.828125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.532439841608285, "grad_norm": 1.015105128288269, "kl": 1.95166015625, "learning_rate": 5.296729357846176e-07, "loss": 0.1118, "num_tokens": 63288196.0, "reward": 1.9296875, "reward_std": 0.15255236625671387, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.533658239415169, "grad_norm": 0.8635534048080444, "kl": 1.3046875, "learning_rate": 5.26942270740618e-07, "loss": 0.0196, "num_tokens": 63301284.0, "reward": 1.93359375, "reward_std": 0.1878252476453781, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 115.09375, "completions/mean_terminated_length": 115.09375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.5348766372220535, "grad_norm": 0.14451856911182404, "kl": 0.3349609375, "learning_rate": 5.242184722242282e-07, "loss": 0.0134, "num_tokens": 63316010.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 103.578125, "completions/mean_terminated_length": 103.578125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.536095035028937, "grad_norm": 0.659485399723053, "kl": 0.7783203125, "learning_rate": 5.215015422097902e-07, "loss": 0.0124, "num_tokens": 63329063.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 108.125, "completions/mean_terminated_length": 108.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.537313432835821, "grad_norm": 0.8324778079986572, "kl": 1.2294921875, "learning_rate": 5.187914826666662e-07, "loss": 0.0364, "num_tokens": 63343167.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 97.296875, "completions/mean_terminated_length": 97.296875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.538531830642705, "grad_norm": 0.5334643721580505, "kl": 0.45458984375, "learning_rate": 5.16088295559235e-07, "loss": -0.0088, "num_tokens": 63356146.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 119.515625, "completions/mean_terminated_length": 119.515625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.539750228449589, "grad_norm": 0.6911762952804565, "kl": 0.95751953125, "learning_rate": 5.133919828468992e-07, "loss": 0.0065, "num_tokens": 63372019.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 109.328125, "completions/mean_terminated_length": 109.328125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.540968626256473, "grad_norm": 0.5797182321548462, "kl": 1.7275390625, "learning_rate": 5.107025464840754e-07, "loss": 0.1018, "num_tokens": 63386184.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 110.1875, "completions/mean_terminated_length": 110.1875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.542187024063357, "grad_norm": 0.9738275408744812, "kl": 2.43603515625, "learning_rate": 5.080199884201953e-07, "loss": 0.1728, "num_tokens": 63400380.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.543405421870241, "grad_norm": 0.9733688235282898, "kl": 1.7119140625, "learning_rate": 5.053443105997069e-07, "loss": 0.0284, "num_tokens": 63416008.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 108.625, "completions/mean_terminated_length": 108.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.544623819677125, "grad_norm": 0.7618336081504822, "kl": 1.34765625, "learning_rate": 5.026755149620688e-07, "loss": -0.0247, "num_tokens": 63430008.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 103.578125, "completions/mean_terminated_length": 103.578125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.545842217484008, "grad_norm": 1.4400227069854736, "kl": 1.19091796875, "learning_rate": 5.000136034417524e-07, "loss": 0.0263, "num_tokens": 63443501.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 97.40625, "completions/mean_terminated_length": 97.40625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.547060615290892, "grad_norm": 0.21403753757476807, "kl": 0.51025390625, "learning_rate": 4.9735857796824e-07, "loss": 0.001, "num_tokens": 63456455.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 106.15625, "completions/mean_terminated_length": 106.15625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.548279013097776, "grad_norm": 0.41299647092819214, "kl": 0.5634765625, "learning_rate": 4.947104404660219e-07, "loss": 0.0039, "num_tokens": 63469681.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 124.9375, "completions/mean_terminated_length": 124.9375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.54949741090466, "grad_norm": 1.8500182628631592, "kl": 3.9951171875, "learning_rate": 4.920691928545973e-07, "loss": 0.3767, "num_tokens": 63484717.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 116.921875, "completions/mean_terminated_length": 116.921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.550715808711544, "grad_norm": 1.01157546043396, "kl": 2.87939453125, "learning_rate": 4.894348370484648e-07, "loss": 0.2031, "num_tokens": 63499840.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 87.40625, "completions/mean_terminated_length": 87.40625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.551934206518428, "grad_norm": 0.19051513075828552, "kl": 0.4609375, "learning_rate": 4.868073749571345e-07, "loss": 0.0184, "num_tokens": 63511450.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 102.46875, "completions/mean_terminated_length": 102.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.553152604325312, "grad_norm": 0.542101263999939, "kl": 0.81787109375, "learning_rate": 4.841868084851175e-07, "loss": 0.0023, "num_tokens": 63524336.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 105.765625, "completions/mean_terminated_length": 105.765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.554371002132196, "grad_norm": 0.21195068955421448, "kl": 0.4091796875, "learning_rate": 4.815731395319278e-07, "loss": 0.0164, "num_tokens": 63538329.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 119.203125, "completions/mean_terminated_length": 119.203125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.55558939993908, "grad_norm": 0.15708865225315094, "kl": 0.39794921875, "learning_rate": 4.789663699920754e-07, "loss": 0.0159, "num_tokens": 63554062.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 103.234375, "completions/mean_terminated_length": 103.234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 4.5568077977459645, "grad_norm": 0.9557214975357056, "kl": 2.02392578125, "learning_rate": 4.763665017550745e-07, "loss": 0.0722, "num_tokens": 63568173.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 106.265625, "completions/mean_terminated_length": 106.265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.5580261955528485, "grad_norm": 0.10777042806148529, "kl": 0.357421875, "learning_rate": 4.737735367054319e-07, "loss": 0.0143, "num_tokens": 63582166.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 102.96875, "completions/mean_terminated_length": 102.96875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 4.559244593359732, "grad_norm": 1.229853868484497, "kl": 1.6953125, "learning_rate": 4.711874767226554e-07, "loss": 0.0471, "num_tokens": 63595740.0, "reward": 1.88671875, "reward_std": 0.27887362241744995, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 123.1875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.560462991166616, "grad_norm": 0.13630397617816925, "kl": 0.3662109375, "learning_rate": 4.686083236812444e-07, "loss": 0.0147, "num_tokens": 63611240.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 106.0625, "completions/mean_terminated_length": 106.0625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.5616813889735, "grad_norm": 0.9122181534767151, "kl": 1.0576171875, "learning_rate": 4.660360794506946e-07, "loss": -0.0047, "num_tokens": 63625204.0, "reward": 1.890625, "reward_std": 0.30935919284820557, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 106.765625, "completions/mean_terminated_length": 106.765625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.562899786780384, "grad_norm": 2.1501190662384033, "kl": 0.64013671875, "learning_rate": 4.634707458954901e-07, "loss": -0.0094, "num_tokens": 63638701.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 108.3125, "completions/mean_terminated_length": 108.3125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.564118184587268, "grad_norm": 2.379937171936035, "kl": 1.5341796875, "learning_rate": 4.609123248751124e-07, "loss": 0.0237, "num_tokens": 63653049.0, "reward": 1.9453125, "reward_std": 0.10263003408908844, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 99.390625, "completions/mean_terminated_length": 99.390625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.565336582394152, "grad_norm": 1.8145102262496948, "kl": 1.94677734375, "learning_rate": 4.5836081824402356e-07, "loss": 0.0464, "num_tokens": 63666058.0, "reward": 1.87890625, "reward_std": 0.2814556658267975, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 112.59375, "completions/mean_terminated_length": 112.59375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.566554980201036, "grad_norm": 1.3411273956298828, "kl": 0.93603515625, "learning_rate": 4.558162278516787e-07, "loss": -0.0092, "num_tokens": 63680480.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 109.0625, "completions/mean_terminated_length": 109.0625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.56777337800792, "grad_norm": 1.204108476638794, "kl": 3.4130859375, "learning_rate": 4.53278555542519e-07, "loss": 0.2367, "num_tokens": 63694356.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.568991775814803, "grad_norm": 0.8329053521156311, "kl": 0.66015625, "learning_rate": 4.5074780315597313e-07, "loss": -0.0158, "num_tokens": 63710116.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 124.078125, "completions/mean_terminated_length": 124.078125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 4.570210173621687, "grad_norm": 0.4989815652370453, "kl": 0.58447265625, "learning_rate": 4.4822397252645256e-07, "loss": 0.0313, "num_tokens": 63725481.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 126.65625, "completions/mean_terminated_length": 126.65625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.571428571428571, "grad_norm": 0.2329147309064865, "kl": 0.36669921875, "learning_rate": 4.4570706548334954e-07, "loss": 0.0147, "num_tokens": 63741659.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 109.859375, "completions/mean_terminated_length": 109.859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.572646969235455, "grad_norm": 2.0941503047943115, "kl": 1.2958984375, "learning_rate": 4.431970838510391e-07, "loss": 0.0534, "num_tokens": 63755954.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 120.1875, "completions/mean_terminated_length": 120.1875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.573865367042339, "grad_norm": 0.7065665125846863, "kl": 1.54150390625, "learning_rate": 4.406940294488771e-07, "loss": 0.0582, "num_tokens": 63771174.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 89.609375, "completions/mean_terminated_length": 89.609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.575083764849223, "grad_norm": 0.37394002079963684, "kl": 0.5341796875, "learning_rate": 4.3819790409119767e-07, "loss": -0.0055, "num_tokens": 63783805.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 95.671875, "completions/mean_terminated_length": 95.671875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.576302162656107, "grad_norm": 0.767173707485199, "kl": 0.72705078125, "learning_rate": 4.357087095873136e-07, "loss": 0.0322, "num_tokens": 63796608.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 94.1875, "completions/mean_terminated_length": 94.1875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.577520560462991, "grad_norm": 1.0883759260177612, "kl": 0.82568359375, "learning_rate": 4.332264477415105e-07, "loss": 0.0074, "num_tokens": 63808820.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 101.703125, "completions/mean_terminated_length": 101.703125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.578738958269875, "grad_norm": 0.3496154248714447, "kl": 0.587890625, "learning_rate": 4.3075112035305233e-07, "loss": -0.0031, "num_tokens": 63821697.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 106.09375, "completions/mean_terminated_length": 106.09375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.5799573560767595, "grad_norm": 1.0244803428649902, "kl": 1.478515625, "learning_rate": 4.282827292161762e-07, "loss": 0.0824, "num_tokens": 63835623.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 100.90625, "completions/mean_terminated_length": 100.90625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.5811757538836435, "grad_norm": 0.1699952781200409, "kl": 0.4033203125, "learning_rate": 4.258212761200875e-07, "loss": 0.0162, "num_tokens": 63849233.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 116.640625, "completions/mean_terminated_length": 116.640625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 4.582394151690527, "grad_norm": 0.750869870185852, "kl": 1.04248046875, "learning_rate": 4.2336676284896907e-07, "loss": -0.0086, "num_tokens": 63864082.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 110.34375, "completions/mean_terminated_length": 110.34375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.583612549497411, "grad_norm": 0.5989825129508972, "kl": 0.79736328125, "learning_rate": 4.209191911819688e-07, "loss": -0.0211, "num_tokens": 63878432.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 106.015625, "completions/mean_terminated_length": 106.015625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.584830947304295, "grad_norm": 2.4867894649505615, "kl": 3.0751953125, "learning_rate": 4.1847856289320423e-07, "loss": 0.1311, "num_tokens": 63892025.0, "reward": 1.86328125, "reward_std": 0.32564985752105713, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 91.359375, "completions/mean_terminated_length": 91.359375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.586049345111179, "grad_norm": 0.4761410653591156, "kl": 0.58349609375, "learning_rate": 4.1604487975176136e-07, "loss": -0.0067, "num_tokens": 63904552.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 101.640625, "completions/mean_terminated_length": 101.640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.587267742918063, "grad_norm": 0.5428698062896729, "kl": 0.53857421875, "learning_rate": 4.1361814352169014e-07, "loss": 0.0166, "num_tokens": 63917585.0, "reward": 1.95703125, "reward_std": 0.07999982684850693, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 160.390625, "completions/mean_terminated_length": 160.390625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.588486140724947, "grad_norm": 0.7239593863487244, "kl": 0.63037109375, "learning_rate": 4.1119835596200474e-07, "loss": -0.0151, "num_tokens": 63934818.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 124.734375, "completions/mean_terminated_length": 124.734375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.589704538531831, "grad_norm": 0.7245201468467712, "kl": 1.28564453125, "learning_rate": 4.087855188266832e-07, "loss": -0.0133, "num_tokens": 63949425.0, "reward": 1.86328125, "reward_std": 0.3403775990009308, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.2215663492679596, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 115.296875, "completions/mean_terminated_length": 115.296875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 4.590922936338715, "grad_norm": 1.3781201839447021, "kl": 2.23486328125, "learning_rate": 4.063796338646664e-07, "loss": 0.1274, "num_tokens": 63963716.0, "reward": 1.94140625, "reward_std": 0.1275724172592163, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 110.40625, "completions/mean_terminated_length": 110.40625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.592141334145598, "grad_norm": 1.2039921283721924, "kl": 0.64404296875, "learning_rate": 4.03980702819855e-07, "loss": -0.0151, "num_tokens": 63977966.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 98.515625, "completions/mean_terminated_length": 98.515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.593359731952482, "grad_norm": 1.9082257747650146, "kl": 2.32958984375, "learning_rate": 4.015887274311114e-07, "loss": 0.1314, "num_tokens": 63990807.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 97.703125, "completions/mean_terminated_length": 97.703125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.594578129759366, "grad_norm": 0.1561286300420761, "kl": 0.40185546875, "learning_rate": 3.992037094322532e-07, "loss": 0.0161, "num_tokens": 64003884.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 113.890625, "completions/mean_terminated_length": 113.890625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.59579652756625, "grad_norm": 0.6908033490180969, "kl": 1.2958984375, "learning_rate": 3.9682565055205514e-07, "loss": 0.044, "num_tokens": 64018381.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 101.34375, "completions/mean_terminated_length": 101.34375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.597014925373134, "grad_norm": 0.7909918427467346, "kl": 1.54296875, "learning_rate": 3.944545525142507e-07, "loss": 0.0477, "num_tokens": 64031843.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 98.40625, "completions/mean_terminated_length": 98.40625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.598233323180018, "grad_norm": 0.43603068590164185, "kl": 0.845703125, "learning_rate": 3.920904170375239e-07, "loss": 0.0024, "num_tokens": 64044949.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 110.453125, "completions/mean_terminated_length": 110.453125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.599451720986902, "grad_norm": 0.8433831930160522, "kl": 0.8896484375, "learning_rate": 3.89733245835513e-07, "loss": -0.0131, "num_tokens": 64059210.0, "reward": 1.91796875, "reward_std": 0.185698002576828, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 105.25, "completions/mean_terminated_length": 105.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.600670118793786, "grad_norm": 1.083461046218872, "kl": 2.369140625, "learning_rate": 3.8738304061681107e-07, "loss": 0.1419, "num_tokens": 64072682.0, "reward": 1.96484375, "reward_std": 0.09943688660860062, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 94.515625, "completions/mean_terminated_length": 94.515625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.60188851660067, "grad_norm": 0.4185962677001953, "kl": 0.52685546875, "learning_rate": 3.8503980308496004e-07, "loss": 0.0076, "num_tokens": 64085395.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 97.046875, "completions/mean_terminated_length": 97.046875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.6031069144075545, "grad_norm": 1.2667579650878906, "kl": 1.482421875, "learning_rate": 3.8270353493845005e-07, "loss": 0.0516, "num_tokens": 64098094.0, "reward": 1.9296875, "reward_std": 0.16071803867816925, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 100.015625, "completions/mean_terminated_length": 100.015625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.604325312214438, "grad_norm": 1.2187390327453613, "kl": 3.38134765625, "learning_rate": 3.8037423787071983e-07, "loss": 0.1919, "num_tokens": 64111135.0, "reward": 1.9140625, "reward_std": 0.19101837277412415, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 123.59375, "completions/mean_terminated_length": 123.59375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.605543710021322, "grad_norm": 0.49792760610580444, "kl": 0.900390625, "learning_rate": 3.780519135701566e-07, "loss": 0.027, "num_tokens": 64127013.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 123.140625, "completions/mean_terminated_length": 123.140625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.606762107828206, "grad_norm": 2.5020790100097656, "kl": 3.74755859375, "learning_rate": 3.757365637200927e-07, "loss": 0.1011, "num_tokens": 64142966.0, "reward": 1.8515625, "reward_std": 0.30674588680267334, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2498759627342224, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 89.828125, "completions/mean_terminated_length": 89.828125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.60798050563509, "grad_norm": 1.402538776397705, "kl": 0.63330078125, "learning_rate": 3.734281899988068e-07, "loss": -0.0032, "num_tokens": 64155331.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 86.890625, "completions/mean_terminated_length": 86.890625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.609198903441974, "grad_norm": 0.8223384022712708, "kl": 1.12646484375, "learning_rate": 3.71126794079516e-07, "loss": 0.0419, "num_tokens": 64167564.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 111.875, "completions/mean_terminated_length": 111.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.610417301248858, "grad_norm": 0.1817001849412918, "kl": 0.40283203125, "learning_rate": 3.688323776303837e-07, "loss": 0.0161, "num_tokens": 64181788.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 4.611635699055742, "grad_norm": 1.259257435798645, "kl": 2.806640625, "learning_rate": 3.665449423145151e-07, "loss": 0.1264, "num_tokens": 64197004.0, "reward": 1.890625, "reward_std": 0.30935922265052795, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 104.59375, "completions/mean_terminated_length": 104.59375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.612854096862626, "grad_norm": 0.7318357229232788, "kl": 0.47509765625, "learning_rate": 3.6426448978995054e-07, "loss": 0.0193, "num_tokens": 64210914.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 84.640625, "completions/mean_terminated_length": 84.640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.61407249466951, "grad_norm": 0.4737887680530548, "kl": 0.5439453125, "learning_rate": 3.619910217096723e-07, "loss": -0.0097, "num_tokens": 64222307.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 101.828125, "completions/mean_terminated_length": 101.828125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.615290892476393, "grad_norm": 0.9769932627677917, "kl": 1.72802734375, "learning_rate": 3.597245397216009e-07, "loss": 0.1111, "num_tokens": 64235784.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 99.046875, "completions/mean_terminated_length": 99.046875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 4.616509290283277, "grad_norm": 1.354239821434021, "kl": 1.9453125, "learning_rate": 3.574650454685902e-07, "loss": 0.0608, "num_tokens": 64249019.0, "reward": 1.8828125, "reward_std": 0.3314563035964966, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.2203386276960373, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 111.5625, "completions/mean_terminated_length": 111.5625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.617727688090161, "grad_norm": 1.3521060943603516, "kl": 1.36669921875, "learning_rate": 3.55212540588431e-07, "loss": -0.0426, "num_tokens": 64264055.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.24397502839565277, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 93.890625, "completions/mean_terminated_length": 93.890625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.618946085897045, "grad_norm": 0.18975216150283813, "kl": 0.38427734375, "learning_rate": 3.529670267138474e-07, "loss": 0.0154, "num_tokens": 64276480.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 110.9375, "completions/mean_terminated_length": 110.9375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.620164483703929, "grad_norm": 1.1866258382797241, "kl": 1.52734375, "learning_rate": 3.507285054724929e-07, "loss": 0.0531, "num_tokens": 64290956.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 104.09375, "completions/mean_terminated_length": 104.09375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.621382881510813, "grad_norm": 1.264614462852478, "kl": 2.6015625, "learning_rate": 3.4849697848695853e-07, "loss": 0.0589, "num_tokens": 64304906.0, "reward": 1.84375, "reward_std": 0.32436180114746094, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.2136233150959015, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.622601279317697, "grad_norm": 0.792011022567749, "kl": 1.44384765625, "learning_rate": 3.462724473747603e-07, "loss": 0.0663, "num_tokens": 64319514.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 95.84375, "completions/mean_terminated_length": 95.84375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.623819677124581, "grad_norm": 2.4770255088806152, "kl": 1.31640625, "learning_rate": 3.440549137483462e-07, "loss": -0.0391, "num_tokens": 64331752.0, "reward": 1.88671875, "reward_std": 0.3204077482223511, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 109.0625, "completions/mean_terminated_length": 109.0625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.6250380749314655, "grad_norm": 1.4826604127883911, "kl": 1.08740234375, "learning_rate": 3.4184437921509163e-07, "loss": 0.0297, "num_tokens": 64345604.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 106.9375, "completions/mean_terminated_length": 106.9375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.6262564727383495, "grad_norm": 0.18056659400463104, "kl": 0.42626953125, "learning_rate": 3.3964084537729927e-07, "loss": 0.0171, "num_tokens": 64359328.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 99.6875, "completions/mean_terminated_length": 99.6875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.627474870545233, "grad_norm": 1.3275965452194214, "kl": 1.90673828125, "learning_rate": 3.374443138321937e-07, "loss": 0.1249, "num_tokens": 64372012.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 111.359375, "completions/mean_terminated_length": 111.359375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 4.628693268352117, "grad_norm": 0.6197124123573303, "kl": 0.99560546875, "learning_rate": 3.3525478617192687e-07, "loss": 0.0153, "num_tokens": 64386227.0, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 100.34375, "completions/mean_terminated_length": 100.34375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.629911666159001, "grad_norm": 0.5677672028541565, "kl": 0.62109375, "learning_rate": 3.3307226398357705e-07, "loss": 0.0164, "num_tokens": 64399281.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 113.609375, "completions/mean_terminated_length": 113.609375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.631130063965885, "grad_norm": 0.8528050184249878, "kl": 1.17431640625, "learning_rate": 3.308967488491366e-07, "loss": 0.045, "num_tokens": 64413352.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 102.90625, "completions/mean_terminated_length": 102.90625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.632348461772769, "grad_norm": 0.1566721498966217, "kl": 0.3828125, "learning_rate": 3.2872824234552627e-07, "loss": 0.0153, "num_tokens": 64426642.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 103.953125, "completions/mean_terminated_length": 103.953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.633566859579653, "grad_norm": 0.8295358419418335, "kl": 0.8642578125, "learning_rate": 3.2656674604458426e-07, "loss": 0.0442, "num_tokens": 64439855.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 97.8125, "completions/mean_terminated_length": 97.8125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.634785257386537, "grad_norm": 1.1512370109558105, "kl": 1.18701171875, "learning_rate": 3.2441226151306403e-07, "loss": 0.0787, "num_tokens": 64452483.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 110.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.636003655193421, "grad_norm": 0.15445177257061005, "kl": 0.39013671875, "learning_rate": 3.222647903126419e-07, "loss": 0.0156, "num_tokens": 64466619.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 112.640625, "completions/mean_terminated_length": 112.640625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.637222053000304, "grad_norm": 0.1686825007200241, "kl": 0.462890625, "learning_rate": 3.201243339999083e-07, "loss": 0.0185, "num_tokens": 64480852.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 103.28125, "completions/mean_terminated_length": 103.28125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.638440450807188, "grad_norm": 1.074973464012146, "kl": 2.00634765625, "learning_rate": 3.1799089412636765e-07, "loss": 0.1055, "num_tokens": 64494302.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 117.4375, "completions/mean_terminated_length": 117.4375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.639658848614072, "grad_norm": 1.9560855627059937, "kl": 6.53466796875, "learning_rate": 3.158644722384407e-07, "loss": 0.4167, "num_tokens": 64508770.0, "reward": 1.88671875, "reward_std": 0.2740863561630249, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 108.015625, "completions/mean_terminated_length": 108.015625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.640877246420956, "grad_norm": 1.0016165971755981, "kl": 1.3515625, "learning_rate": 3.1374506987746136e-07, "loss": 0.0052, "num_tokens": 64522795.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 110.53125, "completions/mean_terminated_length": 110.53125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.64209564422784, "grad_norm": 0.6806345582008362, "kl": 1.23193359375, "learning_rate": 3.1163268857967277e-07, "loss": 0.0543, "num_tokens": 64537269.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 95.609375, "completions/mean_terminated_length": 95.609375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.643314042034724, "grad_norm": 0.9560636281967163, "kl": 1.333984375, "learning_rate": 3.095273298762291e-07, "loss": 0.0457, "num_tokens": 64549684.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 98.796875, "completions/mean_terminated_length": 98.796875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.644532439841608, "grad_norm": 0.12134388834238052, "kl": 0.365234375, "learning_rate": 3.074289952931975e-07, "loss": 0.0146, "num_tokens": 64562479.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 109.0, "completions/mean_terminated_length": 109.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.645750837648492, "grad_norm": 0.816624104976654, "kl": 1.369140625, "learning_rate": 3.0533768635155115e-07, "loss": 0.0365, "num_tokens": 64577023.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 95.203125, "completions/mean_terminated_length": 95.203125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.646969235455376, "grad_norm": 0.7539921998977661, "kl": 1.30908203125, "learning_rate": 3.03253404567172e-07, "loss": 0.0939, "num_tokens": 64589788.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.6481876332622605, "grad_norm": 0.9630627632141113, "kl": 0.61474609375, "learning_rate": 3.01176151450846e-07, "loss": 0.0476, "num_tokens": 64605741.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 102.671875, "completions/mean_terminated_length": 102.671875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.6494060310691445, "grad_norm": 0.7610322833061218, "kl": 0.9287109375, "learning_rate": 2.9910592850826983e-07, "loss": 0.0017, "num_tokens": 64618992.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 107.046875, "completions/mean_terminated_length": 107.046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.650624428876028, "grad_norm": 0.6992836594581604, "kl": 0.91259765625, "learning_rate": 2.970427372400353e-07, "loss": -0.0282, "num_tokens": 64632771.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 101.234375, "completions/mean_terminated_length": 101.234375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.651842826682912, "grad_norm": 0.19451932609081268, "kl": 0.4072265625, "learning_rate": 2.949865791416473e-07, "loss": 0.0162, "num_tokens": 64645946.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 108.59375, "completions/mean_terminated_length": 108.59375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.653061224489796, "grad_norm": 0.503322184085846, "kl": 1.03466796875, "learning_rate": 2.9293745570350365e-07, "loss": 0.0554, "num_tokens": 64660496.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 95.578125, "completions/mean_terminated_length": 95.578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.65427962229668, "grad_norm": 0.7920958399772644, "kl": 1.02001953125, "learning_rate": 2.908953684109117e-07, "loss": 0.0276, "num_tokens": 64673237.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 99.359375, "completions/mean_terminated_length": 99.359375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.655498020103564, "grad_norm": 0.5459563136100769, "kl": 0.40869140625, "learning_rate": 2.8886031874407085e-07, "loss": 0.0318, "num_tokens": 64686708.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 110.078125, "completions/mean_terminated_length": 110.078125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.656716417910448, "grad_norm": 0.13124732673168182, "kl": 0.37841796875, "learning_rate": 2.868323081780877e-07, "loss": 0.0152, "num_tokens": 64700969.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.657934815717332, "grad_norm": 1.0952417850494385, "kl": 1.58935546875, "learning_rate": 2.8481133818295647e-07, "loss": 0.0665, "num_tokens": 64716221.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 110.28125, "completions/mean_terminated_length": 110.28125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.659153213524216, "grad_norm": 1.1376310586929321, "kl": 2.392578125, "learning_rate": 2.8279741022357535e-07, "loss": 0.0875, "num_tokens": 64730263.0, "reward": 1.89453125, "reward_std": 0.29831066727638245, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 101.53125, "completions/mean_terminated_length": 101.53125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.660371611331099, "grad_norm": 0.2274738848209381, "kl": 0.4169921875, "learning_rate": 2.8079052575973764e-07, "loss": 0.0167, "num_tokens": 64743537.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 99.75, "completions/mean_terminated_length": 99.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.661590009137983, "grad_norm": 0.5307462215423584, "kl": 0.7177734375, "learning_rate": 2.787906862461287e-07, "loss": 0.0108, "num_tokens": 64756657.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 116.53125, "completions/mean_terminated_length": 116.53125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.662808406944867, "grad_norm": 0.8348912000656128, "kl": 1.1396484375, "learning_rate": 2.7679789313232785e-07, "loss": 0.0433, "num_tokens": 64771595.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 113.09375, "completions/mean_terminated_length": 113.09375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.664026804751751, "grad_norm": 0.40696847438812256, "kl": 0.63427734375, "learning_rate": 2.748121478628074e-07, "loss": -0.003, "num_tokens": 64786065.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 104.40625, "completions/mean_terminated_length": 104.40625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.665245202558635, "grad_norm": 1.6064378023147583, "kl": 1.4169921875, "learning_rate": 2.7283345187693264e-07, "loss": 0.0844, "num_tokens": 64799219.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 103.515625, "completions/mean_terminated_length": 103.515625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.666463600365519, "grad_norm": 0.7629457116127014, "kl": 0.7939453125, "learning_rate": 2.7086180660895524e-07, "loss": 0.0523, "num_tokens": 64812916.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 115.21875, "completions/mean_terminated_length": 115.21875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.667681998172403, "grad_norm": 3.409142017364502, "kl": 4.853515625, "learning_rate": 2.6889721348801854e-07, "loss": 0.3336, "num_tokens": 64827058.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 104.3125, "completions/mean_terminated_length": 104.3125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.668900395979287, "grad_norm": 0.961280345916748, "kl": 1.6376953125, "learning_rate": 2.669396739381547e-07, "loss": 0.0046, "num_tokens": 64840902.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 103.984375, "completions/mean_terminated_length": 103.984375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.6701187937861715, "grad_norm": 1.4666321277618408, "kl": 2.9208984375, "learning_rate": 2.649891893782841e-07, "loss": 0.1469, "num_tokens": 64854125.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 103.46875, "completions/mean_terminated_length": 103.46875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.6713371915930555, "grad_norm": 0.6392114758491516, "kl": 0.5400390625, "learning_rate": 2.6304576122221035e-07, "loss": -0.0004, "num_tokens": 64867875.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 107.84375, "completions/mean_terminated_length": 107.84375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.6725555893999395, "grad_norm": 2.1581151485443115, "kl": 0.923828125, "learning_rate": 2.611093908786222e-07, "loss": 0.0423, "num_tokens": 64881625.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 127.578125, "completions/mean_terminated_length": 127.578125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.673773987206823, "grad_norm": 1.390701174736023, "kl": 1.80615234375, "learning_rate": 2.591800797510968e-07, "loss": 0.083, "num_tokens": 64896726.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09676052629947662, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 105.0, "completions/mean_terminated_length": 105.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.674992385013707, "grad_norm": 0.6681347489356995, "kl": 1.08056640625, "learning_rate": 2.5725782923808897e-07, "loss": 0.0675, "num_tokens": 64910326.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 98.5625, "completions/mean_terminated_length": 98.5625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.676210782820591, "grad_norm": 1.3516159057617188, "kl": 1.1708984375, "learning_rate": 2.5534264073293957e-07, "loss": 0.0522, "num_tokens": 64923266.0, "reward": 1.9609375, "reward_std": 0.07423343509435654, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 4.677429180627475, "grad_norm": 1.7296699285507202, "kl": 4.79736328125, "learning_rate": 2.5343451562386934e-07, "loss": 0.3388, "num_tokens": 64937458.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 95.546875, "completions/mean_terminated_length": 95.546875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.678647578434359, "grad_norm": 1.0445709228515625, "kl": 1.62548828125, "learning_rate": 2.515334552939785e-07, "loss": 0.0729, "num_tokens": 64950253.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 100.5625, "completions/mean_terminated_length": 100.5625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.679865976241243, "grad_norm": 0.7655278444290161, "kl": 0.408203125, "learning_rate": 2.496394611212483e-07, "loss": 0.0545, "num_tokens": 64963769.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 114.21875, "completions/mean_terminated_length": 114.21875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.681084374048127, "grad_norm": 2.1925437450408936, "kl": 1.40966796875, "learning_rate": 2.4775253447853477e-07, "loss": 0.0455, "num_tokens": 64978423.0, "reward": 1.94921875, "reward_std": 0.0973096489906311, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 98.765625, "completions/mean_terminated_length": 98.765625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.682302771855011, "grad_norm": 1.1010936498641968, "kl": 2.36279296875, "learning_rate": 2.45872676733574e-07, "loss": 0.1162, "num_tokens": 64991488.0, "reward": 1.92578125, "reward_std": 0.17176659405231476, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 113.34375, "completions/mean_terminated_length": 113.34375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.683521169661894, "grad_norm": 0.6224284768104553, "kl": 1.76318359375, "learning_rate": 2.439998892489781e-07, "loss": 0.1161, "num_tokens": 65006974.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 96.71875, "completions/mean_terminated_length": 96.71875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.684739567468778, "grad_norm": 1.5631150007247925, "kl": 1.66064453125, "learning_rate": 2.4213417338223247e-07, "loss": 0.0944, "num_tokens": 65020252.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 99.3125, "completions/mean_terminated_length": 99.3125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.685957965275662, "grad_norm": 0.7125867009162903, "kl": 0.41748046875, "learning_rate": 2.402755304856974e-07, "loss": 0.036, "num_tokens": 65033184.0, "reward": 1.953125, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 128.3125, "completions/mean_terminated_length": 128.3125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.687176363082546, "grad_norm": 0.6268688440322876, "kl": 0.3759765625, "learning_rate": 2.3842396190660754e-07, "loss": 0.0293, "num_tokens": 65049156.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 103.625, "completions/mean_terminated_length": 103.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.68839476088943, "grad_norm": 0.8402151465415955, "kl": 1.166015625, "learning_rate": 2.365794689870682e-07, "loss": 0.0088, "num_tokens": 65062836.0, "reward": 1.93359375, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 95.015625, "completions/mean_terminated_length": 95.015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.689613158696314, "grad_norm": 2.8252174854278564, "kl": 1.99609375, "learning_rate": 2.347420530640565e-07, "loss": 0.0842, "num_tokens": 65075325.0, "reward": 1.9453125, "reward_std": 0.10263003408908844, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 111.703125, "completions/mean_terminated_length": 111.703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.690831556503198, "grad_norm": 0.8102311491966248, "kl": 0.92431640625, "learning_rate": 2.3291171546942048e-07, "loss": 0.0206, "num_tokens": 65089594.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.692049954310082, "grad_norm": 1.2786054611206055, "kl": 2.462890625, "learning_rate": 2.3108845752987552e-07, "loss": 0.1551, "num_tokens": 65103872.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 115.296875, "completions/mean_terminated_length": 115.296875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.6932683521169665, "grad_norm": 0.7731289863586426, "kl": 1.16845703125, "learning_rate": 2.292722805670078e-07, "loss": 0.0363, "num_tokens": 65118843.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 110.65625, "completions/mean_terminated_length": 110.65625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.6944867499238505, "grad_norm": 1.4401888847351074, "kl": 3.2265625, "learning_rate": 2.274631858972698e-07, "loss": 0.2577, "num_tokens": 65132885.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 91.890625, "completions/mean_terminated_length": 91.890625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.6957051477307346, "grad_norm": 0.9804502725601196, "kl": 1.66455078125, "learning_rate": 2.2566117483197923e-07, "loss": 0.0972, "num_tokens": 65145326.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 100.78125, "completions/mean_terminated_length": 100.78125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.696923545537618, "grad_norm": 2.253025531768799, "kl": 1.6240234375, "learning_rate": 2.2386624867732132e-07, "loss": 0.0709, "num_tokens": 65158616.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 104.015625, "completions/mean_terminated_length": 104.015625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.698141943344502, "grad_norm": 0.5499392151832581, "kl": 1.30810546875, "learning_rate": 2.2207840873434417e-07, "loss": 0.0583, "num_tokens": 65172233.0, "reward": 1.94921875, "reward_std": 0.0973096489906311, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 102.234375, "completions/mean_terminated_length": 102.234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.699360341151386, "grad_norm": 0.8561251759529114, "kl": 1.4033203125, "learning_rate": 2.2029765629895893e-07, "loss": 0.0553, "num_tokens": 65186040.0, "reward": 1.9375, "reward_std": 0.13719715178012848, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 107.703125, "completions/mean_terminated_length": 107.703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.70057873895827, "grad_norm": 1.67141592502594, "kl": 1.51171875, "learning_rate": 2.1852399266194312e-07, "loss": 0.0267, "num_tokens": 65199949.0, "reward": 1.91015625, "reward_std": 0.2541165053844452, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.19697457551956177, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 88.71875, "completions/mean_terminated_length": 88.71875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 4.701797136765154, "grad_norm": 0.6893802285194397, "kl": 0.75048828125, "learning_rate": 2.1675741910893278e-07, "loss": 0.0111, "num_tokens": 65212067.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 108.578125, "completions/mean_terminated_length": 108.578125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.703015534572038, "grad_norm": 0.21272358298301697, "kl": 0.39794921875, "learning_rate": 2.149979369204247e-07, "loss": 0.0159, "num_tokens": 65226424.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 111.609375, "completions/mean_terminated_length": 111.609375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.704233932378922, "grad_norm": 0.6092376708984375, "kl": 0.7509765625, "learning_rate": 2.132455473717765e-07, "loss": 0.0297, "num_tokens": 65240383.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 94.71875, "completions/mean_terminated_length": 94.71875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.705452330185806, "grad_norm": 0.841013491153717, "kl": 1.27685546875, "learning_rate": 2.115002517332043e-07, "loss": 0.0726, "num_tokens": 65253293.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 104.65625, "completions/mean_terminated_length": 104.65625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.706670727992689, "grad_norm": 0.197819784283638, "kl": 0.39990234375, "learning_rate": 2.0976205126978068e-07, "loss": 0.016, "num_tokens": 65266463.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 105.640625, "completions/mean_terminated_length": 105.640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.707889125799573, "grad_norm": 1.0006157159805298, "kl": 1.78125, "learning_rate": 2.0803094724143879e-07, "loss": 0.1226, "num_tokens": 65280840.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.709107523606457, "grad_norm": 0.21906791627407074, "kl": 0.37890625, "learning_rate": 2.063069409029661e-07, "loss": 0.0151, "num_tokens": 65295272.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.710325921413341, "grad_norm": 0.8268148303031921, "kl": 0.51025390625, "learning_rate": 2.0459003350400408e-07, "loss": 0.0223, "num_tokens": 65310440.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 103.453125, "completions/mean_terminated_length": 103.453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.711544319220225, "grad_norm": 0.9275975227355957, "kl": 1.2421875, "learning_rate": 2.028802262890517e-07, "loss": -0.0086, "num_tokens": 65323997.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 93.609375, "completions/mean_terminated_length": 93.609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.712762717027109, "grad_norm": 5.17544412612915, "kl": 1.13134765625, "learning_rate": 2.0117752049745642e-07, "loss": 0.0405, "num_tokens": 65336508.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 119.734375, "completions/mean_terminated_length": 119.734375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.713981114833993, "grad_norm": 1.0699388980865479, "kl": 2.3505859375, "learning_rate": 1.9948191736342327e-07, "loss": 0.1133, "num_tokens": 65351971.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 97.953125, "completions/mean_terminated_length": 97.953125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.715199512640877, "grad_norm": 0.6885457038879395, "kl": 1.1640625, "learning_rate": 1.9779341811600795e-07, "loss": 0.0589, "num_tokens": 65364664.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 98.59375, "completions/mean_terminated_length": 98.59375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.7164179104477615, "grad_norm": 1.1247642040252686, "kl": 1.6884765625, "learning_rate": 1.9611202397911366e-07, "loss": 0.0652, "num_tokens": 65377710.0, "reward": 1.90625, "reward_std": 0.21884362399578094, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 94.90625, "completions/mean_terminated_length": 94.90625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.7176363082546455, "grad_norm": 2.306532621383667, "kl": 4.00439453125, "learning_rate": 1.9443773617149665e-07, "loss": 0.2226, "num_tokens": 65390608.0, "reward": 1.90234375, "reward_std": 0.2762135863304138, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 100.53125, "completions/mean_terminated_length": 100.53125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.71885470606153, "grad_norm": 0.8839141130447388, "kl": 2.05859375, "learning_rate": 1.9277055590676163e-07, "loss": 0.056, "num_tokens": 65403842.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 112.5625, "completions/mean_terminated_length": 112.5625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.720073103868413, "grad_norm": 1.3439226150512695, "kl": 1.9228515625, "learning_rate": 1.9111048439335978e-07, "loss": 0.0456, "num_tokens": 65418374.0, "reward": 1.8984375, "reward_std": 0.22621294856071472, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 110.15625, "completions/mean_terminated_length": 110.15625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.721291501675297, "grad_norm": 0.521388053894043, "kl": 0.59912109375, "learning_rate": 1.8945752283459185e-07, "loss": 0.0229, "num_tokens": 65432728.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 100.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.722509899482181, "grad_norm": 1.021977186203003, "kl": 2.27001953125, "learning_rate": 1.8781167242860276e-07, "loss": 0.1128, "num_tokens": 65445872.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 103.703125, "completions/mean_terminated_length": 103.703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.723728297289065, "grad_norm": 0.6573406457901001, "kl": 1.03515625, "learning_rate": 1.861729343683849e-07, "loss": 0.0687, "num_tokens": 65459021.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 95.625, "completions/mean_terminated_length": 95.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.724946695095949, "grad_norm": 0.40920987725257874, "kl": 0.68017578125, "learning_rate": 1.8454130984177366e-07, "loss": 0.0043, "num_tokens": 65471565.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 100.875, "completions/mean_terminated_length": 100.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.726165092902833, "grad_norm": 0.5468021631240845, "kl": 0.7021484375, "learning_rate": 1.8291680003145074e-07, "loss": 0.0059, "num_tokens": 65484757.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 111.109375, "completions/mean_terminated_length": 111.109375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.727383490709717, "grad_norm": 0.5730736255645752, "kl": 0.642578125, "learning_rate": 1.8129940611493756e-07, "loss": -0.0066, "num_tokens": 65499524.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 97.09375, "completions/mean_terminated_length": 97.09375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.728601888516601, "grad_norm": 1.2122973203659058, "kl": 1.93505859375, "learning_rate": 1.796891292645986e-07, "loss": 0.0995, "num_tokens": 65512762.0, "reward": 1.93359375, "reward_std": 0.1878252476453781, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 104.1875, "completions/mean_terminated_length": 104.1875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.729820286323484, "grad_norm": 0.6962583661079407, "kl": 0.8984375, "learning_rate": 1.7808597064764012e-07, "loss": 0.0136, "num_tokens": 65526614.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 100.34375, "completions/mean_terminated_length": 100.34375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.731038684130368, "grad_norm": 0.8552742004394531, "kl": 0.4755859375, "learning_rate": 1.7648993142610816e-07, "loss": 0.042, "num_tokens": 65539724.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 93.3125, "completions/mean_terminated_length": 93.3125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.732257081937252, "grad_norm": 0.9514443874359131, "kl": 0.63720703125, "learning_rate": 1.7490101275689064e-07, "loss": 0.0218, "num_tokens": 65552848.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 99.640625, "completions/mean_terminated_length": 99.640625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.733475479744136, "grad_norm": 0.6686466336250305, "kl": 1.1201171875, "learning_rate": 1.7331921579170963e-07, "loss": 0.0648, "num_tokens": 65566001.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 108.09375, "completions/mean_terminated_length": 108.09375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.73469387755102, "grad_norm": 1.0472418069839478, "kl": 2.12158203125, "learning_rate": 1.7174454167713016e-07, "loss": 0.0622, "num_tokens": 65580127.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 105.328125, "completions/mean_terminated_length": 105.328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.735912275357904, "grad_norm": 0.9879398941993713, "kl": 1.47021484375, "learning_rate": 1.7017699155454926e-07, "loss": 0.0479, "num_tokens": 65593948.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 116.265625, "completions/mean_terminated_length": 116.265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.737130673164788, "grad_norm": 1.1261732578277588, "kl": 2.0400390625, "learning_rate": 1.6861656656020464e-07, "loss": 0.1015, "num_tokens": 65608573.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 100.265625, "completions/mean_terminated_length": 100.265625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.7383490709716725, "grad_norm": 0.7990170121192932, "kl": 1.10498046875, "learning_rate": 1.6706326782516603e-07, "loss": 0.0452, "num_tokens": 65621814.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 118.40625, "completions/mean_terminated_length": 118.40625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.7395674687785565, "grad_norm": 0.8168745040893555, "kl": 1.177734375, "learning_rate": 1.6551709647533942e-07, "loss": 0.0198, "num_tokens": 65636624.0, "reward": 1.93359375, "reward_std": 0.1878252476453781, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 103.8125, "completions/mean_terminated_length": 103.8125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.7407858665854405, "grad_norm": 1.4160653352737427, "kl": 2.91162109375, "learning_rate": 1.639780536314639e-07, "loss": 0.2309, "num_tokens": 65650420.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 127.28125, "completions/mean_terminated_length": 127.28125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.742004264392325, "grad_norm": 0.18712511658668518, "kl": 0.35693359375, "learning_rate": 1.6244614040911265e-07, "loss": 0.0143, "num_tokens": 65666806.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 105.09375, "completions/mean_terminated_length": 105.09375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.743222662199208, "grad_norm": 0.5253886580467224, "kl": 0.75634765625, "learning_rate": 1.6092135791868856e-07, "loss": -0.0014, "num_tokens": 65681092.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 90.203125, "completions/mean_terminated_length": 90.203125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.744441060006092, "grad_norm": 0.26752617955207825, "kl": 0.45751953125, "learning_rate": 1.5940370726542864e-07, "loss": 0.0183, "num_tokens": 65693497.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 101.984375, "completions/mean_terminated_length": 101.984375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.745659457812976, "grad_norm": 1.0436917543411255, "kl": 2.859375, "learning_rate": 1.5789318954939848e-07, "loss": 0.1929, "num_tokens": 65707160.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1118449866771698, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 86.609375, "completions/mean_terminated_length": 86.609375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.74687785561986, "grad_norm": 0.5028412938117981, "kl": 0.822265625, "learning_rate": 1.5638980586549336e-07, "loss": 0.0076, "num_tokens": 65719375.0, "reward": 1.9453125, "reward_std": 0.1431555151939392, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 104.1875, "completions/mean_terminated_length": 104.1875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.748096253426744, "grad_norm": 0.8683962225914001, "kl": 1.76171875, "learning_rate": 1.5489355730343935e-07, "loss": 0.0764, "num_tokens": 65733243.0, "reward": 1.95703125, "reward_std": 0.07999982684850693, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.749314651233628, "grad_norm": 0.20789065957069397, "kl": 0.42431640625, "learning_rate": 1.5340444494778784e-07, "loss": 0.017, "num_tokens": 65746539.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 105.234375, "completions/mean_terminated_length": 105.234375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.750533049040512, "grad_norm": 1.3260459899902344, "kl": 1.28369140625, "learning_rate": 1.519224698779198e-07, "loss": 0.0511, "num_tokens": 65760082.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 95.703125, "completions/mean_terminated_length": 95.703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.751751446847396, "grad_norm": 0.6389672756195068, "kl": 0.71826171875, "learning_rate": 1.5044763316804267e-07, "loss": -0.0087, "num_tokens": 65772831.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 99.0, "completions/mean_terminated_length": 99.0, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 4.752969844654279, "grad_norm": 0.11110034584999084, "kl": 0.36328125, "learning_rate": 1.4897993588718684e-07, "loss": 0.0146, "num_tokens": 65785975.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 101.1875, "completions/mean_terminated_length": 101.1875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.754188242461163, "grad_norm": 0.32454463839530945, "kl": 0.564453125, "learning_rate": 1.4751937909921242e-07, "loss": 0.0093, "num_tokens": 65799139.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 109.359375, "completions/mean_terminated_length": 109.359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.755406640268047, "grad_norm": 0.536110520362854, "kl": 0.55712890625, "learning_rate": 1.4606596386279925e-07, "loss": 0.0139, "num_tokens": 65813258.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 127.390625, "completions/mean_terminated_length": 127.390625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.756625038074931, "grad_norm": 1.0681941509246826, "kl": 1.77880859375, "learning_rate": 1.4461969123145458e-07, "loss": 0.0859, "num_tokens": 65829131.0, "reward": 1.96484375, "reward_std": 0.06563031673431396, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 96.0625, "completions/mean_terminated_length": 96.0625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.757843435881815, "grad_norm": 1.1421082019805908, "kl": 1.5517578125, "learning_rate": 1.431805622535043e-07, "loss": 0.0677, "num_tokens": 65842263.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 109.765625, "completions/mean_terminated_length": 109.765625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.759061833688699, "grad_norm": 1.0574158430099487, "kl": 1.544921875, "learning_rate": 1.4174857797209951e-07, "loss": 0.0915, "num_tokens": 65856288.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 119.40625, "completions/mean_terminated_length": 119.40625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.760280231495583, "grad_norm": 0.6525222659111023, "kl": 0.76123046875, "learning_rate": 1.4032373942520994e-07, "loss": 0.0248, "num_tokens": 65871410.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 113.765625, "completions/mean_terminated_length": 113.765625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 4.7614986293024675, "grad_norm": 2.2537028789520264, "kl": 3.99755859375, "learning_rate": 1.389060476456283e-07, "loss": 0.2345, "num_tokens": 65885803.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 117.703125, "completions/mean_terminated_length": 117.703125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.7627170271093515, "grad_norm": 0.2981604039669037, "kl": 0.4208984375, "learning_rate": 1.37495503660966e-07, "loss": 0.0168, "num_tokens": 65901104.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 128.84375, "completions/mean_terminated_length": 114.63492584228516, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.763935424916236, "grad_norm": 2.74812650680542, "kl": 7.27197265625, "learning_rate": 1.3609210849365172e-07, "loss": 0.519, "num_tokens": 65916566.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 94.015625, "completions/mean_terminated_length": 94.015625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.765153822723119, "grad_norm": 0.7436337471008301, "kl": 1.005859375, "learning_rate": 1.3469586316093518e-07, "loss": 0.0203, "num_tokens": 65929359.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 100.21875, "completions/mean_terminated_length": 100.21875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 4.766372220530003, "grad_norm": 0.8195116519927979, "kl": 1.19189453125, "learning_rate": 1.3330676867488123e-07, "loss": 0.0306, "num_tokens": 65942709.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 102.3125, "completions/mean_terminated_length": 102.3125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.767590618336887, "grad_norm": 0.734254002571106, "kl": 1.37158203125, "learning_rate": 1.3192482604237223e-07, "loss": 0.0309, "num_tokens": 65955801.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 108.078125, "completions/mean_terminated_length": 108.078125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.768809016143771, "grad_norm": 1.1415404081344604, "kl": 1.38525390625, "learning_rate": 1.3055003626510687e-07, "loss": 0.0655, "num_tokens": 65969678.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 131.078125, "completions/mean_terminated_length": 131.078125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.770027413950655, "grad_norm": 0.616899847984314, "kl": 0.7666015625, "learning_rate": 1.2918240033960028e-07, "loss": 0.0268, "num_tokens": 65986915.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 105.078125, "completions/mean_terminated_length": 105.078125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.771245811757539, "grad_norm": 5.862179756164551, "kl": 2.92333984375, "learning_rate": 1.2782191925717836e-07, "loss": 0.1535, "num_tokens": 65999992.0, "reward": 1.90625, "reward_std": 0.21884362399578094, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 88.5625, "completions/mean_terminated_length": 88.5625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.772464209564423, "grad_norm": 1.1822210550308228, "kl": 1.736328125, "learning_rate": 1.2646859400398447e-07, "loss": 0.1109, "num_tokens": 66012132.0, "reward": 1.92578125, "reward_std": 0.16360090672969818, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 104.359375, "completions/mean_terminated_length": 104.359375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.773682607371307, "grad_norm": 1.0786939859390259, "kl": 0.63232421875, "learning_rate": 1.25122425560974e-07, "loss": 0.0491, "num_tokens": 66025427.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 89.90625, "completions/mean_terminated_length": 89.90625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.774901005178191, "grad_norm": 0.5760017037391663, "kl": 0.6884765625, "learning_rate": 1.23783414903913e-07, "loss": 0.0269, "num_tokens": 66037437.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 96.6875, "completions/mean_terminated_length": 96.6875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.776119402985074, "grad_norm": 1.1657596826553345, "kl": 2.1787109375, "learning_rate": 1.224515630033818e-07, "loss": 0.0669, "num_tokens": 66050601.0, "reward": 1.88671875, "reward_std": 0.3204077482223511, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.21445617079734802, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 117.859375, "completions/mean_terminated_length": 117.859375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.777337800791958, "grad_norm": 0.9412912130355835, "kl": 0.87255859375, "learning_rate": 1.211268708247715e-07, "loss": 0.0072, "num_tokens": 66066088.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 110.53125, "completions/mean_terminated_length": 110.53125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.778556198598842, "grad_norm": 1.419384241104126, "kl": 3.0322265625, "learning_rate": 1.1980933932828175e-07, "loss": 0.1393, "num_tokens": 66080186.0, "reward": 1.86328125, "reward_std": 0.32564985752105713, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 93.234375, "completions/mean_terminated_length": 93.234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.779774596405726, "grad_norm": 0.31601977348327637, "kl": 0.43310546875, "learning_rate": 1.1849896946892426e-07, "loss": 0.0173, "num_tokens": 66092281.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 110.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.78099299421261, "grad_norm": 0.196449875831604, "kl": 0.33642578125, "learning_rate": 1.1719576219651585e-07, "loss": 0.0135, "num_tokens": 66106833.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 109.484375, "completions/mean_terminated_length": 109.484375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 4.782211392019494, "grad_norm": 0.9758482575416565, "kl": 0.81494140625, "learning_rate": 1.1589971845568427e-07, "loss": 0.0681, "num_tokens": 66120912.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 99.625, "completions/mean_terminated_length": 99.625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 4.7834297898263785, "grad_norm": 0.6849044561386108, "kl": 1.8291015625, "learning_rate": 1.1461083918586357e-07, "loss": 0.1551, "num_tokens": 66133912.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 85.515625, "completions/mean_terminated_length": 85.515625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.7846481876332625, "grad_norm": 0.9618244767189026, "kl": 1.130859375, "learning_rate": 1.1332912532129758e-07, "loss": 0.0353, "num_tokens": 66145609.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 112.546875, "completions/mean_terminated_length": 112.546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.7858665854401465, "grad_norm": 3.4142582416534424, "kl": 3.7412109375, "learning_rate": 1.1205457779103313e-07, "loss": 0.2019, "num_tokens": 66160196.0, "reward": 1.90234375, "reward_std": 0.2537210285663605, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 96.875, "completions/mean_terminated_length": 96.875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.787084983247031, "grad_norm": 0.8980206251144409, "kl": 1.0419921875, "learning_rate": 1.1078719751892342e-07, "loss": -0.0085, "num_tokens": 66173020.0, "reward": 1.9140625, "reward_std": 0.18201877176761627, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 97.359375, "completions/mean_terminated_length": 97.359375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.788303381053914, "grad_norm": 0.7793219089508057, "kl": 1.06689453125, "learning_rate": 1.0952698542362805e-07, "loss": 0.0576, "num_tokens": 66186539.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 104.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.789521778860798, "grad_norm": 0.5925754308700562, "kl": 0.7734375, "learning_rate": 1.0827394241860634e-07, "loss": 0.0153, "num_tokens": 66200323.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 100.390625, "completions/mean_terminated_length": 100.390625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 4.790740176667682, "grad_norm": 0.7325735688209534, "kl": 1.23828125, "learning_rate": 1.0702806941212729e-07, "loss": 0.0403, "num_tokens": 66213500.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 122.3125, "completions/mean_terminated_length": 122.3125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.791958574474566, "grad_norm": 1.1856900453567505, "kl": 2.5078125, "learning_rate": 1.0578936730725631e-07, "loss": 0.104, "num_tokens": 66229744.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 100.71875, "completions/mean_terminated_length": 100.71875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.79317697228145, "grad_norm": 0.9344854950904846, "kl": 1.5224609375, "learning_rate": 1.0455783700186628e-07, "loss": 0.036, "num_tokens": 66242774.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 89.921875, "completions/mean_terminated_length": 89.921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.794395370088334, "grad_norm": 0.5004689693450928, "kl": 0.53369140625, "learning_rate": 1.033334793886287e-07, "loss": -0.0018, "num_tokens": 66255137.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.795613767895218, "grad_norm": 0.5883986949920654, "kl": 0.42529296875, "learning_rate": 1.0211629535501811e-07, "loss": 0.0145, "num_tokens": 66269041.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 106.375, "completions/mean_terminated_length": 106.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.796832165702102, "grad_norm": 1.4458178281784058, "kl": 1.423828125, "learning_rate": 1.0090628578330763e-07, "loss": 0.059, "num_tokens": 66282537.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 109.765625, "completions/mean_terminated_length": 109.765625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.798050563508986, "grad_norm": 2.338016986846924, "kl": 3.3759765625, "learning_rate": 9.970345155056904e-08, "loss": 0.137, "num_tokens": 66296922.0, "reward": 1.90625, "reward_std": 0.2041158676147461, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 113.109375, "completions/mean_terminated_length": 113.109375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.799268961315869, "grad_norm": 1.5952800512313843, "kl": 2.466796875, "learning_rate": 9.85077935286749e-08, "loss": 0.0668, "num_tokens": 66311169.0, "reward": 1.90234375, "reward_std": 0.2762135863304138, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14683964848518372, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 97.203125, "completions/mean_terminated_length": 97.203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.800487359122753, "grad_norm": 1.1101131439208984, "kl": 1.009765625, "learning_rate": 9.731931258429638e-08, "loss": 0.0482, "num_tokens": 66324142.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.801705756929637, "grad_norm": 0.4189724028110504, "kl": 0.74169921875, "learning_rate": 9.613800957890218e-08, "loss": 0.0206, "num_tokens": 66339020.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 97.6875, "completions/mean_terminated_length": 97.6875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.802924154736521, "grad_norm": 0.9864413142204285, "kl": 1.44140625, "learning_rate": 9.496388536875623e-08, "loss": 0.0421, "num_tokens": 66352280.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 91.765625, "completions/mean_terminated_length": 91.765625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.804142552543405, "grad_norm": 1.104511022567749, "kl": 2.119140625, "learning_rate": 9.379694080491996e-08, "loss": 0.0388, "num_tokens": 66364913.0, "reward": 1.90234375, "reward_std": 0.2762135863304138, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15344709157943726, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 92.15625, "completions/mean_terminated_length": 92.15625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 4.805360950350289, "grad_norm": 0.7186823487281799, "kl": 0.6376953125, "learning_rate": 9.263717673325124e-08, "loss": -0.005, "num_tokens": 66377611.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 120.765625, "completions/mean_terminated_length": 120.765625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 4.8065793481571735, "grad_norm": 2.314565658569336, "kl": 1.482421875, "learning_rate": 9.148459399440423e-08, "loss": 0.1518, "num_tokens": 66392588.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 101.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.8077977459640575, "grad_norm": 0.7073848247528076, "kl": 0.83984375, "learning_rate": 9.033919342382402e-08, "loss": 0.0282, "num_tokens": 66406220.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 99.109375, "completions/mean_terminated_length": 99.109375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.8090161437709416, "grad_norm": 0.8630446195602417, "kl": 0.431640625, "learning_rate": 8.920097585175314e-08, "loss": 0.045, "num_tokens": 66419275.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 128.96875, "completions/mean_terminated_length": 128.96875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 4.810234541577826, "grad_norm": 1.551652431488037, "kl": 0.978515625, "learning_rate": 8.806994210322606e-08, "loss": 0.075, "num_tokens": 66435609.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 92.71875, "completions/mean_terminated_length": 92.71875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.811452939384709, "grad_norm": 1.2123610973358154, "kl": 1.6328125, "learning_rate": 8.694609299807143e-08, "loss": 0.0142, "num_tokens": 66448359.0, "reward": 1.87890625, "reward_std": 0.3425048589706421, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 112.421875, "completions/mean_terminated_length": 112.421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.812671337191593, "grad_norm": 0.20445814728736877, "kl": 0.43408203125, "learning_rate": 8.582942935090877e-08, "loss": 0.0174, "num_tokens": 66462722.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 114.3125, "completions/mean_terminated_length": 114.3125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.813889734998477, "grad_norm": 3.3857579231262207, "kl": 3.01318359375, "learning_rate": 8.471995197114836e-08, "loss": 0.1916, "num_tokens": 66477158.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 106.78125, "completions/mean_terminated_length": 106.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.815108132805361, "grad_norm": 0.9521863460540771, "kl": 1.22119140625, "learning_rate": 8.361766166299356e-08, "loss": -0.0019, "num_tokens": 66491256.0, "reward": 1.91796875, "reward_std": 0.185698002576828, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 108.953125, "completions/mean_terminated_length": 108.953125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.816326530612245, "grad_norm": 1.1335405111312866, "kl": 0.83203125, "learning_rate": 8.252255922543751e-08, "loss": 0.0408, "num_tokens": 66505397.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 93.84375, "completions/mean_terminated_length": 93.84375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.817544928419129, "grad_norm": 1.0414100885391235, "kl": 1.26806640625, "learning_rate": 8.143464545226298e-08, "loss": 0.0405, "num_tokens": 66518283.0, "reward": 1.9296875, "reward_std": 0.15255236625671387, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 107.984375, "completions/mean_terminated_length": 107.984375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.818763326226013, "grad_norm": 0.11243178695440292, "kl": 0.4033203125, "learning_rate": 8.035392113204255e-08, "loss": 0.0161, "num_tokens": 66532050.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 102.734375, "completions/mean_terminated_length": 102.734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.819981724032897, "grad_norm": 0.6979477405548096, "kl": 0.421875, "learning_rate": 7.928038704813623e-08, "loss": 0.0137, "num_tokens": 66545769.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 100.140625, "completions/mean_terminated_length": 100.140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.82120012183978, "grad_norm": 4.051510334014893, "kl": 3.92822265625, "learning_rate": 7.82140439786927e-08, "loss": 0.2433, "num_tokens": 66558810.0, "reward": 1.87890625, "reward_std": 0.3043491244316101, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 91.8125, "completions/mean_terminated_length": 91.8125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.822418519646664, "grad_norm": 0.8578826189041138, "kl": 1.02392578125, "learning_rate": 7.715489269665033e-08, "loss": 0.0694, "num_tokens": 66571382.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 103.5625, "completions/mean_terminated_length": 103.5625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.823636917453548, "grad_norm": 0.2939687967300415, "kl": 0.5283203125, "learning_rate": 7.61029339697339e-08, "loss": 0.0014, "num_tokens": 66584946.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 92.578125, "completions/mean_terminated_length": 92.578125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.824855315260432, "grad_norm": 0.18539491295814514, "kl": 0.365234375, "learning_rate": 7.505816856045012e-08, "loss": 0.0146, "num_tokens": 66597559.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 107.53125, "completions/mean_terminated_length": 107.53125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.826073713067316, "grad_norm": 0.5694692730903625, "kl": 0.54150390625, "learning_rate": 7.402059722609655e-08, "loss": 0.0344, "num_tokens": 66611281.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 103.984375, "completions/mean_terminated_length": 103.984375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.8272921108742, "grad_norm": 0.28173500299453735, "kl": 0.4287109375, "learning_rate": 7.299022071875716e-08, "loss": 0.0172, "num_tokens": 66624736.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 98.484375, "completions/mean_terminated_length": 98.484375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.8285105086810844, "grad_norm": 0.8263930678367615, "kl": 1.09521484375, "learning_rate": 7.196703978529451e-08, "loss": -0.021, "num_tokens": 66637847.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 97.78125, "completions/mean_terminated_length": 97.78125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.8297289064879685, "grad_norm": 0.5963596701622009, "kl": 0.703125, "learning_rate": 7.095105516736201e-08, "loss": 0.0299, "num_tokens": 66650969.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 99.046875, "completions/mean_terminated_length": 99.046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.8309473042948525, "grad_norm": 2.4068729877471924, "kl": 2.04296875, "learning_rate": 6.994226760139389e-08, "loss": 0.063, "num_tokens": 66664236.0, "reward": 1.9453125, "reward_std": 0.10263003408908844, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 112.921875, "completions/mean_terminated_length": 112.921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.832165702101737, "grad_norm": 0.5101301074028015, "kl": 0.7451171875, "learning_rate": 6.894067781860636e-08, "loss": 0.0025, "num_tokens": 66678639.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.833384099908621, "grad_norm": 1.2267519235610962, "kl": 0.8466796875, "learning_rate": 6.79462865450009e-08, "loss": 0.0471, "num_tokens": 66691279.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 105.59375, "completions/mean_terminated_length": 105.59375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.834602497715504, "grad_norm": 1.015566349029541, "kl": 1.33349609375, "learning_rate": 6.695909450136095e-08, "loss": 0.0554, "num_tokens": 66704917.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.07552725076675415, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 99.078125, "completions/mean_terminated_length": 99.078125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.835820895522388, "grad_norm": 3.1359164714813232, "kl": 4.55419921875, "learning_rate": 6.597910240324967e-08, "loss": 0.2056, "num_tokens": 66717866.0, "reward": 1.90625, "reward_std": 0.16080442070960999, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 105.453125, "completions/mean_terminated_length": 105.453125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.837039293329272, "grad_norm": 0.21479840576648712, "kl": 0.3779296875, "learning_rate": 6.500631096101218e-08, "loss": 0.0151, "num_tokens": 66731623.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 105.5, "completions/mean_terminated_length": 105.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.838257691136156, "grad_norm": 1.5228136777877808, "kl": 2.65283203125, "learning_rate": 6.404072087977554e-08, "loss": 0.1485, "num_tokens": 66745167.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 100.015625, "completions/mean_terminated_length": 100.015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.83947608894304, "grad_norm": 0.19640925526618958, "kl": 0.43310546875, "learning_rate": 6.308233285944432e-08, "loss": 0.0173, "num_tokens": 66758224.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 98.453125, "completions/mean_terminated_length": 98.453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.840694486749924, "grad_norm": 0.644632875919342, "kl": 0.58935546875, "learning_rate": 6.213114759470507e-08, "loss": 0.0197, "num_tokens": 66771245.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 99.484375, "completions/mean_terminated_length": 99.484375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.841912884556808, "grad_norm": 2.299846887588501, "kl": 2.60009765625, "learning_rate": 6.118716577502404e-08, "loss": 0.0706, "num_tokens": 66784692.0, "reward": 1.84765625, "reward_std": 0.3235225975513458, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.21704266965389252, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 89.75, "completions/mean_terminated_length": 89.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.843131282363692, "grad_norm": 0.9357838034629822, "kl": 0.77392578125, "learning_rate": 6.025038808464168e-08, "loss": 0.054, "num_tokens": 66796812.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 105.40625, "completions/mean_terminated_length": 105.40625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.844349680170575, "grad_norm": 1.197455883026123, "kl": 0.91845703125, "learning_rate": 5.9320815202582596e-08, "loss": 0.0125, "num_tokens": 66810718.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 102.859375, "completions/mean_terminated_length": 102.859375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.845568077977459, "grad_norm": 0.8584703803062439, "kl": 2.02001953125, "learning_rate": 5.839844780264336e-08, "loss": 0.0778, "num_tokens": 66824613.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 95.453125, "completions/mean_terminated_length": 95.453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.846786475784343, "grad_norm": 0.14491821825504303, "kl": 0.3955078125, "learning_rate": 5.748328655340141e-08, "loss": 0.0158, "num_tokens": 66837578.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 129.015625, "completions/mean_terminated_length": 129.015625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.848004873591227, "grad_norm": 0.11538845300674438, "kl": 0.35498046875, "learning_rate": 5.6575332118209425e-08, "loss": 0.0142, "num_tokens": 66854419.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 104.84375, "completions/mean_terminated_length": 104.84375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.849223271398111, "grad_norm": 1.0628950595855713, "kl": 1.45703125, "learning_rate": 5.567458515519541e-08, "loss": 0.0748, "num_tokens": 66868753.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.850441669204995, "grad_norm": 1.8829939365386963, "kl": 5.14404296875, "learning_rate": 5.4781046317267103e-08, "loss": 0.3326, "num_tokens": 66882071.0, "reward": 1.890625, "reward_std": 0.26782506704330444, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 89.21875, "completions/mean_terminated_length": 89.21875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.8516600670118795, "grad_norm": 0.5972232818603516, "kl": 0.41796875, "learning_rate": 5.389471625210086e-08, "loss": 0.0118, "num_tokens": 66893933.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 97.265625, "completions/mean_terminated_length": 97.265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.8528784648187635, "grad_norm": 0.8531050682067871, "kl": 0.65185546875, "learning_rate": 5.301559560215386e-08, "loss": 0.0403, "num_tokens": 66906574.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 96.0625, "completions/mean_terminated_length": 96.0625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.8540968626256475, "grad_norm": 2.035139322280884, "kl": 1.796875, "learning_rate": 5.214368500465305e-08, "loss": 0.0698, "num_tokens": 66919306.0, "reward": 1.9453125, "reward_std": 0.10263003408908844, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 93.265625, "completions/mean_terminated_length": 93.265625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.855315260432532, "grad_norm": 0.5584826469421387, "kl": 0.576171875, "learning_rate": 5.12789850916029e-08, "loss": 0.0144, "num_tokens": 66931747.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 112.328125, "completions/mean_terminated_length": 112.328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.856533658239416, "grad_norm": 0.8119257092475891, "kl": 0.75244140625, "learning_rate": 5.042149648977756e-08, "loss": 0.0495, "num_tokens": 66946592.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 100.0625, "completions/mean_terminated_length": 100.0625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.857752056046299, "grad_norm": 0.521231472492218, "kl": 0.79443359375, "learning_rate": 4.957121982072766e-08, "loss": 0.0045, "num_tokens": 66960084.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 101.90625, "completions/mean_terminated_length": 101.90625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.858970453853183, "grad_norm": 0.47635817527770996, "kl": 0.4423828125, "learning_rate": 4.87281557007746e-08, "loss": 0.0223, "num_tokens": 66973654.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 115.609375, "completions/mean_terminated_length": 115.609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.860188851660067, "grad_norm": 0.8597273230552673, "kl": 1.603515625, "learning_rate": 4.78923047410107e-08, "loss": 0.0387, "num_tokens": 66988461.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 104.234375, "completions/mean_terminated_length": 104.234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.861407249466951, "grad_norm": 2.604308843612671, "kl": 1.85546875, "learning_rate": 4.706366754730129e-08, "loss": 0.0343, "num_tokens": 67002300.0, "reward": 1.89453125, "reward_std": 0.29831066727638245, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 108.3125, "completions/mean_terminated_length": 108.3125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.862625647273835, "grad_norm": 1.2024939060211182, "kl": 2.025390625, "learning_rate": 4.624224472028149e-08, "loss": 0.0502, "num_tokens": 67016328.0, "reward": 1.921875, "reward_std": 0.11451567709445953, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.863844045080719, "grad_norm": 1.0216950178146362, "kl": 0.58251953125, "learning_rate": 4.542803685536057e-08, "loss": 0.0151, "num_tokens": 67029208.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 96.546875, "completions/mean_terminated_length": 96.546875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.865062442887603, "grad_norm": 0.4507007896900177, "kl": 0.6748046875, "learning_rate": 4.4621044542714206e-08, "loss": 0.0166, "num_tokens": 67042331.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.866280840694487, "grad_norm": 1.6013461351394653, "kl": 3.26123046875, "learning_rate": 4.382126836728895e-08, "loss": 0.1499, "num_tokens": 67055387.0, "reward": 1.87890625, "reward_std": 0.24986201524734497, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18123632669448853, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 106.9375, "completions/mean_terminated_length": 106.9375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.86749923850137, "grad_norm": 2.3722572326660156, "kl": 1.267578125, "learning_rate": 4.3028708908801065e-08, "loss": 0.0512, "num_tokens": 67069679.0, "reward": 1.890625, "reward_std": 0.22449812293052673, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13729241490364075, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 102.859375, "completions/mean_terminated_length": 102.859375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.868717636308254, "grad_norm": 0.8374854922294617, "kl": 0.88916015625, "learning_rate": 4.2243366741735457e-08, "loss": 0.0252, "num_tokens": 67083134.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.869936034115138, "grad_norm": 1.4485129117965698, "kl": 2.76123046875, "learning_rate": 4.1465242435345665e-08, "loss": 0.1529, "num_tokens": 67097366.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09834947437047958, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 96.234375, "completions/mean_terminated_length": 96.234375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.871154431922022, "grad_norm": 0.7792390584945679, "kl": 0.97998046875, "learning_rate": 4.0694336553654955e-08, "loss": 0.0312, "num_tokens": 67110469.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 93.625, "completions/mean_terminated_length": 93.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.872372829728906, "grad_norm": 0.9791873693466187, "kl": 1.091796875, "learning_rate": 3.99306496554519e-08, "loss": 0.034, "num_tokens": 67123149.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 110.65625, "completions/mean_terminated_length": 110.65625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.87359122753579, "grad_norm": 1.5143539905548096, "kl": 1.55126953125, "learning_rate": 3.917418229429482e-08, "loss": 0.0831, "num_tokens": 67137519.0, "reward": 1.921875, "reward_std": 0.18281513452529907, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.06099375709891319, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 95.578125, "completions/mean_terminated_length": 95.578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.8748096253426745, "grad_norm": 1.1061946153640747, "kl": 2.458984375, "learning_rate": 3.842493501850619e-08, "loss": 0.1306, "num_tokens": 67150356.0, "reward": 1.890625, "reward_std": 0.30935919284820557, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11356419324874878, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 111.96875, "completions/mean_terminated_length": 111.96875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.8760280231495585, "grad_norm": 1.8512822389602661, "kl": 2.68359375, "learning_rate": 3.768290837117605e-08, "loss": 0.1464, "num_tokens": 67164938.0, "reward": 1.875, "reward_std": 0.3120192289352417, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1044638603925705, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 103.75, "completions/mean_terminated_length": 103.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.877246420956443, "grad_norm": 0.4249907433986664, "kl": 0.54345703125, "learning_rate": 3.694810289016193e-08, "loss": 0.0072, "num_tokens": 67178834.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 106.578125, "completions/mean_terminated_length": 106.578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.878464818763327, "grad_norm": 0.13120582699775696, "kl": 0.42333984375, "learning_rate": 3.622051910808666e-08, "loss": 0.0169, "num_tokens": 67192759.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 105.765625, "completions/mean_terminated_length": 105.765625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.879683216570211, "grad_norm": 1.4815564155578613, "kl": 1.27685546875, "learning_rate": 3.550015755233727e-08, "loss": 0.0165, "num_tokens": 67206488.0, "reward": 1.9296875, "reward_std": 0.15255236625671387, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 112.578125, "completions/mean_terminated_length": 112.578125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.880901614377094, "grad_norm": 0.5918959975242615, "kl": 1.10107421875, "learning_rate": 3.478701874506607e-08, "loss": 0.016, "num_tokens": 67221285.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 99.46875, "completions/mean_terminated_length": 99.46875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.882120012183978, "grad_norm": 1.4374006986618042, "kl": 1.2021484375, "learning_rate": 3.408110320319069e-08, "loss": 0.0041, "num_tokens": 67234539.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 95.046875, "completions/mean_terminated_length": 95.046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.883338409990862, "grad_norm": 1.3265210390090942, "kl": 1.18896484375, "learning_rate": 3.3382411438392936e-08, "loss": -0.0013, "num_tokens": 67247414.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 108.078125, "completions/mean_terminated_length": 108.078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.884556807797746, "grad_norm": 1.6612986326217651, "kl": 3.326171875, "learning_rate": 3.269094395711769e-08, "loss": 0.1542, "num_tokens": 67261555.0, "reward": 1.89453125, "reward_std": 0.2462611049413681, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.18662993609905243, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 135.046875, "completions/mean_terminated_length": 135.046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.88577520560463, "grad_norm": 1.2699729204177856, "kl": 1.5078125, "learning_rate": 3.200670126057403e-08, "loss": 0.0817, "num_tokens": 67278854.0, "reward": 1.8984375, "reward_std": 0.2872621417045593, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.17938801646232605, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.886993603411514, "grad_norm": 0.3772834837436676, "kl": 0.537109375, "learning_rate": 3.132968384473412e-08, "loss": 0.0018, "num_tokens": 67293806.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 107.328125, "completions/mean_terminated_length": 107.328125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.888212001218398, "grad_norm": 0.926906943321228, "kl": 0.73779296875, "learning_rate": 3.065989220033205e-08, "loss": 0.0287, "num_tokens": 67307747.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 105.8125, "completions/mean_terminated_length": 105.8125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.889430399025282, "grad_norm": 1.1064248085021973, "kl": 0.6103515625, "learning_rate": 2.999732681286727e-08, "loss": 0.0112, "num_tokens": 67321687.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 107.328125, "completions/mean_terminated_length": 107.328125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 4.890648796832165, "grad_norm": 1.5089647769927979, "kl": 2.33837890625, "learning_rate": 2.9341988162595593e-08, "loss": 0.1295, "num_tokens": 67335284.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 96.1875, "completions/mean_terminated_length": 96.1875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.891867194639049, "grad_norm": 0.3605240285396576, "kl": 0.72216796875, "learning_rate": 2.869387672454038e-08, "loss": 0.0382, "num_tokens": 67348288.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 99.9375, "completions/mean_terminated_length": 99.9375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.893085592445933, "grad_norm": 0.4348835051059723, "kl": 0.75, "learning_rate": 2.8052992968482517e-08, "loss": 0.0343, "num_tokens": 67361812.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 93.828125, "completions/mean_terminated_length": 93.828125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.894303990252817, "grad_norm": 0.16989025473594666, "kl": 0.35205078125, "learning_rate": 2.7419337358967068e-08, "loss": 0.0141, "num_tokens": 67374417.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 113.390625, "completions/mean_terminated_length": 113.390625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.895522388059701, "grad_norm": 0.2079843431711197, "kl": 0.419921875, "learning_rate": 2.679291035529663e-08, "loss": 0.0168, "num_tokens": 67388762.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 91.109375, "completions/mean_terminated_length": 91.109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.8967407858665855, "grad_norm": 1.2393263578414917, "kl": 2.83056640625, "learning_rate": 2.6173712411536878e-08, "loss": 0.2176, "num_tokens": 67401153.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 107.6875, "completions/mean_terminated_length": 107.6875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.8979591836734695, "grad_norm": 0.8632946610450745, "kl": 1.1572265625, "learning_rate": 2.556174397651101e-08, "loss": 0.0613, "num_tokens": 67415477.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 94.140625, "completions/mean_terminated_length": 94.140625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.8991775814803535, "grad_norm": 0.6210291385650635, "kl": 0.59765625, "learning_rate": 2.4957005493803087e-08, "loss": 0.0356, "num_tokens": 67428126.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 104.84375, "completions/mean_terminated_length": 104.84375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.900395979287238, "grad_norm": 1.1964771747589111, "kl": 2.1005859375, "learning_rate": 2.4359497401758026e-08, "loss": 0.1248, "num_tokens": 67441436.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08768405020236969, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 103.484375, "completions/mean_terminated_length": 103.484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.901614377094122, "grad_norm": 0.47915148735046387, "kl": 0.65185546875, "learning_rate": 2.3769220133477156e-08, "loss": 0.0249, "num_tokens": 67454691.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 4.902832774901006, "grad_norm": 1.7223702669143677, "kl": 5.22314453125, "learning_rate": 2.318617411682156e-08, "loss": 0.4107, "num_tokens": 67468803.0, "reward": 1.91015625, "reward_std": 0.2541164755821228, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1188335195183754, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 101.046875, "completions/mean_terminated_length": 101.046875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.904051172707889, "grad_norm": 1.602758526802063, "kl": 0.77734375, "learning_rate": 2.2610359774412062e-08, "loss": -0.0068, "num_tokens": 67482174.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 110.890625, "completions/mean_terminated_length": 110.890625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 4.905269570514773, "grad_norm": 0.8719689846038818, "kl": 0.97900390625, "learning_rate": 2.2041777523627018e-08, "loss": -0.0038, "num_tokens": 67496415.0, "reward": 1.91796875, "reward_std": 0.23201942443847656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 99.65625, "completions/mean_terminated_length": 99.65625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.906487968321657, "grad_norm": 0.5578979849815369, "kl": 0.80029296875, "learning_rate": 2.1480427776600088e-08, "loss": 0.0168, "num_tokens": 67509449.0, "reward": 1.94140625, "reward_std": 0.1275724172592163, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05326050892472267, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 104.578125, "completions/mean_terminated_length": 104.578125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.907706366128541, "grad_norm": 0.4407762587070465, "kl": 0.97607421875, "learning_rate": 2.09263109402269e-08, "loss": 0.0301, "num_tokens": 67523294.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0937500074505806, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 122.65625, "completions/mean_terminated_length": 122.65625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 4.908924763935425, "grad_norm": 0.6801420450210571, "kl": 0.9365234375, "learning_rate": 2.037942741615617e-08, "loss": 0.01, "num_tokens": 67538960.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 95.859375, "completions/mean_terminated_length": 95.859375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.910143161742309, "grad_norm": 1.4581317901611328, "kl": 0.76806640625, "learning_rate": 1.9839777600796363e-08, "loss": 0.0397, "num_tokens": 67551951.0, "reward": 1.92578125, "reward_std": 0.20992231369018555, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1416819840669632, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 119.703125, "completions/mean_terminated_length": 119.703125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.911361559549193, "grad_norm": 1.5133756399154663, "kl": 3.0947265625, "learning_rate": 1.9307361885311238e-08, "loss": 0.1644, "num_tokens": 67567692.0, "reward": 1.91796875, "reward_std": 0.23201940953731537, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.17743313312530518, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 100.140625, "completions/mean_terminated_length": 100.140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.912579957356077, "grad_norm": 1.2028878927230835, "kl": 2.24560546875, "learning_rate": 1.8782180655622096e-08, "loss": 0.131, "num_tokens": 67580773.0, "reward": 1.97265625, "reward_std": 0.07733980566263199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 87.109375, "completions/mean_terminated_length": 87.109375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.91379835516296, "grad_norm": 1.6235462427139282, "kl": 2.923828125, "learning_rate": 1.8264234292403316e-08, "loss": 0.1249, "num_tokens": 67592676.0, "reward": 1.90625, "reward_std": 0.18139132857322693, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 113.34375, "completions/mean_terminated_length": 113.34375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.915016752969844, "grad_norm": 0.6525903344154358, "kl": 0.4892578125, "learning_rate": 1.775352317109014e-08, "loss": 0.0062, "num_tokens": 67607226.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 119.15625, "completions/mean_terminated_length": 119.15625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.916235150776728, "grad_norm": 0.9852108955383301, "kl": 1.05517578125, "learning_rate": 1.7250047661868664e-08, "loss": 0.0232, "num_tokens": 67622420.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 113.53125, "completions/mean_terminated_length": 113.53125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.917453548583612, "grad_norm": 1.2030166387557983, "kl": 3.05712890625, "learning_rate": 1.6753808129682525e-08, "loss": 0.1585, "num_tokens": 67637094.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 95.15625, "completions/mean_terminated_length": 95.15625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 4.918671946390496, "grad_norm": 0.7262980341911316, "kl": 1.287109375, "learning_rate": 1.626480493423066e-08, "loss": 0.0208, "num_tokens": 67649904.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.9198903441973805, "grad_norm": 1.1266744136810303, "kl": 1.126953125, "learning_rate": 1.5783038429965093e-08, "loss": 0.0497, "num_tokens": 67662936.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 105.296875, "completions/mean_terminated_length": 105.296875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.9211087420042645, "grad_norm": 1.3249499797821045, "kl": 3.67333984375, "learning_rate": 1.530850896609426e-08, "loss": 0.2644, "num_tokens": 67676907.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10259227454662323, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 94.28125, "completions/mean_terminated_length": 94.28125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 4.9223271398111486, "grad_norm": 3.5393736362457275, "kl": 2.26171875, "learning_rate": 1.484121688657969e-08, "loss": 0.0527, "num_tokens": 67689421.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1574852019548416, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 90.515625, "completions/mean_terminated_length": 90.515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.923545537618033, "grad_norm": 2.691197633743286, "kl": 2.154296875, "learning_rate": 1.4381162530135995e-08, "loss": 0.0234, "num_tokens": 67702070.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 108.4375, "completions/mean_terminated_length": 108.4375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 4.924763935424917, "grad_norm": 1.0431113243103027, "kl": 2.138671875, "learning_rate": 1.3928346230234203e-08, "loss": 0.1385, "num_tokens": 67716570.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 95.171875, "completions/mean_terminated_length": 95.171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.925982333231801, "grad_norm": 0.16618101298809052, "kl": 0.4013671875, "learning_rate": 1.3482768315097316e-08, "loss": 0.0161, "num_tokens": 67729493.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 99.671875, "completions/mean_terminated_length": 99.671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 4.927200731038684, "grad_norm": 1.2988252639770508, "kl": 3.56591796875, "learning_rate": 1.3044429107700319e-08, "loss": 0.1686, "num_tokens": 67742784.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.18483558297157288, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 98.109375, "completions/mean_terminated_length": 98.109375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.928419128845568, "grad_norm": 0.7250789403915405, "kl": 1.67431640625, "learning_rate": 1.2613328925773493e-08, "loss": 0.1133, "num_tokens": 67755207.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 91.578125, "completions/mean_terminated_length": 91.578125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.929637526652452, "grad_norm": 1.8023170232772827, "kl": 3.5283203125, "learning_rate": 1.2189468081799104e-08, "loss": 0.2046, "num_tokens": 67767452.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 108.6875, "completions/mean_terminated_length": 108.6875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.930855924459336, "grad_norm": 1.16653311252594, "kl": 2.8857421875, "learning_rate": 1.177284688301139e-08, "loss": 0.2154, "num_tokens": 67781136.0, "reward": 1.9453125, "reward_std": 0.15467959642410278, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13886408507823944, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.93207432226622, "grad_norm": 0.7155551910400391, "kl": 1.755859375, "learning_rate": 1.1363465631397675e-08, "loss": 0.0996, "num_tokens": 67793880.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1550549566745758, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 101.734375, "completions/mean_terminated_length": 101.734375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.933292720073104, "grad_norm": 2.3226332664489746, "kl": 0.9765625, "learning_rate": 1.0961324623697256e-08, "loss": 0.0128, "num_tokens": 67807327.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 99.453125, "completions/mean_terminated_length": 99.453125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.934511117879988, "grad_norm": 0.17540259659290314, "kl": 0.41845703125, "learning_rate": 1.0566424151401412e-08, "loss": 0.0168, "num_tokens": 67820188.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 94.5625, "completions/mean_terminated_length": 94.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.935729515686872, "grad_norm": 1.1657365560531616, "kl": 1.00732421875, "learning_rate": 1.017876450075228e-08, "loss": 0.0116, "num_tokens": 67832752.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 98.578125, "completions/mean_terminated_length": 98.578125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.936947913493755, "grad_norm": 0.9666550159454346, "kl": 0.78662109375, "learning_rate": 9.798345952743981e-09, "loss": 0.0211, "num_tokens": 67846109.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 85.234375, "completions/mean_terminated_length": 85.234375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.938166311300639, "grad_norm": 0.6693783402442932, "kl": 0.37744140625, "learning_rate": 9.425168783123716e-09, "loss": 0.0028, "num_tokens": 67858180.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 122.078125, "completions/mean_terminated_length": 122.078125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.939384709107523, "grad_norm": 0.13202302157878876, "kl": 0.36083984375, "learning_rate": 9.059233262386225e-09, "loss": 0.0144, "num_tokens": 67874145.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 104.34375, "completions/mean_terminated_length": 104.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.940603106914407, "grad_norm": 0.162319153547287, "kl": 0.384765625, "learning_rate": 8.700539655781548e-09, "loss": 0.0154, "num_tokens": 67887655.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.9418215047212914, "grad_norm": 0.6460691690444946, "kl": 0.64453125, "learning_rate": 8.349088223306157e-09, "loss": 0.0394, "num_tokens": 67902495.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 112.4375, "completions/mean_terminated_length": 112.4375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.9430399025281755, "grad_norm": 1.9032917022705078, "kl": 2.267578125, "learning_rate": 8.004879219709605e-09, "loss": 0.1071, "num_tokens": 67916843.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 95.703125, "completions/mean_terminated_length": 95.703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.9442583003350595, "grad_norm": 0.7910595536231995, "kl": 1.3408203125, "learning_rate": 7.667912894491204e-09, "loss": 0.0259, "num_tokens": 67929472.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 101.375, "completions/mean_terminated_length": 101.375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.945476698141944, "grad_norm": 2.3914902210235596, "kl": 6.41845703125, "learning_rate": 7.338189491900016e-09, "loss": 0.4011, "num_tokens": 67942304.0, "reward": 1.875, "reward_std": 0.30723196268081665, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1883249133825302, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 107.953125, "completions/mean_terminated_length": 107.953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 4.946695095948828, "grad_norm": 1.0328965187072754, "kl": 2.44775390625, "learning_rate": 7.015709250935976e-09, "loss": 0.1447, "num_tokens": 67956277.0, "reward": 1.921875, "reward_std": 0.22097085416316986, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1510545015335083, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 102.375, "completions/mean_terminated_length": 102.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 4.947913493755712, "grad_norm": 0.1790996491909027, "kl": 0.4462890625, "learning_rate": 6.700472405346548e-09, "loss": 0.0179, "num_tokens": 67969757.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 99.921875, "completions/mean_terminated_length": 99.921875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.949131891562595, "grad_norm": 1.071929693222046, "kl": 0.97216796875, "learning_rate": 6.392479183633394e-09, "loss": 0.0532, "num_tokens": 67983192.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 100.4375, "completions/mean_terminated_length": 100.4375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 4.950350289369479, "grad_norm": 1.0670355558395386, "kl": 1.447265625, "learning_rate": 6.091729809042379e-09, "loss": 0.0487, "num_tokens": 67996628.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 92.390625, "completions/mean_terminated_length": 92.390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.951568687176363, "grad_norm": 0.88810795545578, "kl": 1.51025390625, "learning_rate": 5.798224499572458e-09, "loss": 0.0477, "num_tokens": 68009333.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06943204253911972, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 103.453125, "completions/mean_terminated_length": 103.453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.952787084983247, "grad_norm": 0.16897989809513092, "kl": 0.39453125, "learning_rate": 5.5119634679701166e-09, "loss": 0.0158, "num_tokens": 68023034.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 97.515625, "completions/mean_terminated_length": 97.515625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.954005482790131, "grad_norm": 0.7122454643249512, "kl": 1.5302734375, "learning_rate": 5.232946921730486e-09, "loss": 0.1138, "num_tokens": 68036147.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 81.4375, "completions/mean_terminated_length": 81.4375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.955223880597015, "grad_norm": 0.7520949840545654, "kl": 1.1240234375, "learning_rate": 4.961175063099566e-09, "loss": 0.0395, "num_tokens": 68047743.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 101.828125, "completions/mean_terminated_length": 101.828125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.956442278403899, "grad_norm": 0.5717635154724121, "kl": 0.4267578125, "learning_rate": 4.696648089068667e-09, "loss": 0.0152, "num_tokens": 68061332.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.957660676210783, "grad_norm": 0.9904782772064209, "kl": 2.310546875, "learning_rate": 4.4393661913810785e-09, "loss": 0.1017, "num_tokens": 68074778.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 94.0, "completions/mean_terminated_length": 94.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.958879074017667, "grad_norm": 0.7162137031555176, "kl": 0.70849609375, "learning_rate": 4.189329556527622e-09, "loss": 0.0331, "num_tokens": 68087090.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 103.34375, "completions/mean_terminated_length": 103.34375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.96009747182455, "grad_norm": 0.916812002658844, "kl": 1.13427734375, "learning_rate": 3.946538365744435e-09, "loss": 0.0406, "num_tokens": 68100560.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 89.890625, "completions/mean_terminated_length": 89.890625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.961315869631434, "grad_norm": 0.8463624715805054, "kl": 1.2255859375, "learning_rate": 3.7109927950207403e-09, "loss": 0.0822, "num_tokens": 68112913.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 4.962534267438318, "grad_norm": 0.4057583510875702, "kl": 0.693359375, "learning_rate": 3.4826930150899662e-09, "loss": 0.0012, "num_tokens": 68127321.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 88.6875, "completions/mean_terminated_length": 88.6875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.963752665245202, "grad_norm": 0.31804871559143066, "kl": 0.5400390625, "learning_rate": 3.2616391914364056e-09, "loss": -0.0097, "num_tokens": 68139013.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 105.671875, "completions/mean_terminated_length": 105.671875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.9649710630520865, "grad_norm": 0.129403755068779, "kl": 0.36279296875, "learning_rate": 3.0478314842874444e-09, "loss": 0.0145, "num_tokens": 68152800.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 100.515625, "completions/mean_terminated_length": 100.515625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 4.9661894608589705, "grad_norm": 0.3553412854671478, "kl": 0.435546875, "learning_rate": 2.8412700486235566e-09, "loss": 0.0174, "num_tokens": 68166425.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 96.328125, "completions/mean_terminated_length": 96.328125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.9674078586658545, "grad_norm": 0.49105730652809143, "kl": 0.42529296875, "learning_rate": 2.641955034170529e-09, "loss": 0.017, "num_tokens": 68179022.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 108.109375, "completions/mean_terminated_length": 108.109375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.968626256472739, "grad_norm": 0.7301739454269409, "kl": 1.40234375, "learning_rate": 2.449886585400574e-09, "loss": 0.0694, "num_tokens": 68193245.0, "reward": 1.96484375, "reward_std": 0.09943689405918121, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 104.015625, "completions/mean_terminated_length": 104.015625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 4.969844654279623, "grad_norm": 0.636073112487793, "kl": 1.7548828125, "learning_rate": 2.2650648415334374e-09, "loss": 0.122, "num_tokens": 68206478.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 106.5, "completions/mean_terminated_length": 106.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 4.971063052086507, "grad_norm": 0.5509372353553772, "kl": 0.57275390625, "learning_rate": 2.0874899365386225e-09, "loss": 0.0297, "num_tokens": 68220190.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 107.6875, "completions/mean_terminated_length": 107.6875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 4.97228144989339, "grad_norm": 1.0658012628555298, "kl": 1.38037109375, "learning_rate": 1.917161999128725e-09, "loss": 0.0825, "num_tokens": 68234546.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 103.75, "completions/mean_terminated_length": 103.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 4.973499847700274, "grad_norm": 1.1061044931411743, "kl": 1.40576171875, "learning_rate": 1.7540811527683166e-09, "loss": 0.075, "num_tokens": 68248098.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 105.984375, "completions/mean_terminated_length": 105.984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 4.974718245507158, "grad_norm": 0.5303924679756165, "kl": 0.73193359375, "learning_rate": 1.5982475156639532e-09, "loss": 0.0022, "num_tokens": 68262177.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 97.75, "completions/mean_terminated_length": 97.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 4.975936643314042, "grad_norm": 0.6523662805557251, "kl": 0.4541015625, "learning_rate": 1.449661200773056e-09, "loss": 0.015, "num_tokens": 68275617.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.977155041120926, "grad_norm": 0.6750864386558533, "kl": 1.0400390625, "learning_rate": 1.3083223157972503e-09, "loss": 0.0154, "num_tokens": 68288121.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 94.3125, "completions/mean_terminated_length": 94.3125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 4.97837343892781, "grad_norm": 0.6124765276908875, "kl": 0.7880859375, "learning_rate": 1.1742309631845861e-09, "loss": 0.0548, "num_tokens": 68300621.0, "reward": 1.9609375, "reward_std": 0.07232969254255295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043842025101184845, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 103.859375, "completions/mean_terminated_length": 103.859375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.979591836734694, "grad_norm": 0.8255395293235779, "kl": 1.15966796875, "learning_rate": 1.047387240132869e-09, "loss": 0.0088, "num_tokens": 68314412.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 114.171875, "completions/mean_terminated_length": 114.171875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.980810234541578, "grad_norm": 1.0700806379318237, "kl": 1.85107421875, "learning_rate": 9.277912385841081e-10, "loss": 0.0785, "num_tokens": 68328919.0, "reward": 1.91015625, "reward_std": 0.16925540566444397, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 93.390625, "completions/mean_terminated_length": 93.390625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.982028632348462, "grad_norm": 0.8645685315132141, "kl": 1.21240234375, "learning_rate": 8.154430452267381e-10, "loss": 0.0225, "num_tokens": 68341544.0, "reward": 1.921875, "reward_std": 0.15992169082164764, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 109.953125, "completions/mean_terminated_length": 109.953125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.983247030155345, "grad_norm": 0.14806857705116272, "kl": 0.3564453125, "learning_rate": 7.103427414967279e-10, "loss": 0.0142, "num_tokens": 68355925.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 95.859375, "completions/mean_terminated_length": 95.859375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 4.984465427962229, "grad_norm": 0.1395443230867386, "kl": 0.4013671875, "learning_rate": 6.124904035742507e-10, "loss": 0.016, "num_tokens": 68368676.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 104.453125, "completions/mean_terminated_length": 104.453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 4.985683825769113, "grad_norm": 0.8682887554168701, "kl": 1.24609375, "learning_rate": 5.218861023892352e-10, "loss": 0.0925, "num_tokens": 68382297.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 102.828125, "completions/mean_terminated_length": 102.828125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 4.986902223575997, "grad_norm": 1.6723623275756836, "kl": 1.849609375, "learning_rate": 4.3852990361581414e-10, "loss": 0.0245, "num_tokens": 68395974.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.21304203569889069, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 110.0625, "completions/mean_terminated_length": 110.0625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.9881206213828815, "grad_norm": 0.7328651547431946, "kl": 1.2509765625, "learning_rate": 3.624218676734348e-10, "loss": 0.0463, "num_tokens": 68410306.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 87.46875, "completions/mean_terminated_length": 87.46875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 4.9893390191897655, "grad_norm": 0.13773079216480255, "kl": 0.41357421875, "learning_rate": 2.935620497301894e-10, "loss": 0.0166, "num_tokens": 68422024.0, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 118.84375, "completions/mean_terminated_length": 118.84375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 4.99055741699665, "grad_norm": 1.2157151699066162, "kl": 1.54833984375, "learning_rate": 2.3195049969837458e-10, "loss": 0.0668, "num_tokens": 68437622.0, "reward": 1.9140625, "reward_std": 0.24306795001029968, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13152606785297394, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 103.90625, "completions/mean_terminated_length": 103.90625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 4.991775814803534, "grad_norm": 1.4557603597640991, "kl": 1.55224609375, "learning_rate": 1.775872622356012e-10, "loss": 0.0472, "num_tokens": 68451504.0, "reward": 1.94921875, "reward_std": 0.14363107085227966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1283649355173111, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 86.328125, "completions/mean_terminated_length": 86.328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 4.992994212610418, "grad_norm": 0.7420077919960022, "kl": 0.85498046875, "learning_rate": 1.3047237674923553e-10, "loss": 0.0355, "num_tokens": 68463269.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 108.546875, "completions/mean_terminated_length": 108.546875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 4.994212610417302, "grad_norm": 1.1670269966125488, "kl": 1.302734375, "learning_rate": 9.06058773897378e-11, "loss": 0.1095, "num_tokens": 68477592.0, "reward": 1.9609375, "reward_std": 0.11048543453216553, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0625, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 105.5625, "completions/mean_terminated_length": 105.5625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 4.995431008224185, "grad_norm": 0.42395102977752686, "kl": 0.59814453125, "learning_rate": 5.798779305399294e-11, "loss": 0.0129, "num_tokens": 68491780.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 89.734375, "completions/mean_terminated_length": 89.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.996649406031069, "grad_norm": 0.6872451305389404, "kl": 0.53076171875, "learning_rate": 3.2618147384200213e-11, "loss": 0.0191, "num_tokens": 68504307.0, "reward": 1.91015625, "reward_std": 0.156504288315773, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13449780642986298, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 111.0625, "completions/mean_terminated_length": 111.0625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 4.997867803837953, "grad_norm": 0.49931663274765015, "kl": 0.6416015625, "learning_rate": 1.449695877120405e-11, "loss": 0.0003, "num_tokens": 68518639.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.125, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 97.56757354736328, "completions/mean_terminated_length": 97.56757354736328, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 4.999086201644837, "grad_norm": 0.7030074000358582, "kl": 1.35595703125, "learning_rate": 3.624240350053043e-12, "loss": 0.0251, "num_tokens": 68532353.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/ppl_reward/mean": 0.0, "rewards/ppl_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.17536810040473938, "step": 4100 }, { "epoch": 4.999086201644837, "step": 4100, "total_flos": 0.0, "train_loss": 0.13987601281700246, "train_runtime": 35127.6818, "train_samples_per_second": 0.934, "train_steps_per_second": 0.117 } ], "logging_steps": 1, "max_steps": 4100, "num_input_tokens_seen": 68532353, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }