diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.0009612981370042105, - "eval_steps": 50, - "global_step": 50, + "epoch": 0.004996152962219091, + "eval_steps": 250, + "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,27 +15,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, + "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, - "completions/max_terminated_length": 6.0, - "completions/mean_length": 224.75, - "completions/mean_terminated_length": 6.0, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, - "epoch": 1.922596274008421e-05, - "grad_norm": 0.9570798277854919, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 246.0, + "completions/mean_terminated_length": 216.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.9984611848876364e-05, + "grad_norm": 0.35291337966918945, "kl": 0.0, "learning_rate": 0.0, - "loss": 0.0, - "num_tokens": 3538.0, - "reward": 60.62403869628906, - "reward_std": 88.23416137695312, - "rewards/keyword_presence_reward/mean": 0.375, + "loss": -0.0, + "num_tokens": 4068.0, + "reward": 91.08512878417969, + "reward_std": 60.305015563964844, + "rewards/keyword_presence_reward/mean": 0.625, "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 12.5, - "rewards/reward_keyword_presence/std": 18.898223876953125, - "rewards/sentence_structure_reward/mean": 0.005150997545570135, - "rewards/sentence_structure_reward/std": 0.002145076636224985, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.007333379238843918, + "rewards/sentence_structure_reward/std": 0.0034788085613399744, "step": 1 }, { @@ -46,25 +46,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 219.0, - "completions/mean_length": 251.375, - "completions/mean_terminated_length": 219.0, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, - "epoch": 3.845192548016842e-05, - "grad_norm": 0.37707242369651794, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 241.625, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 3.996922369775273e-05, + "grad_norm": 0.42894917726516724, "kl": 0.0, "learning_rate": 4.000000000000001e-06, - "loss": -0.0001, - "num_tokens": 6917.0, - "reward": 106.61274719238281, - "reward_std": 30.455976486206055, - "rewards/keyword_presence_reward/mean": 0.875, - "rewards/keyword_presence_reward/std": 0.3535533845424652, - "rewards/reward_keyword_presence/mean": 21.875, - "rewards/reward_keyword_presence/std": 8.838834762573242, - "rewards/sentence_structure_reward/mean": 0.007429969497025013, - "rewards/sentence_structure_reward/std": 0.002213152125477791, + "loss": -0.0, + "num_tokens": 8253.0, + "reward": 30.49214744567871, + "reward_std": 35.167728424072266, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 6.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008176153525710106, + "rewards/sentence_structure_reward/std": 0.0027330925222486258, "step": 2 }, { @@ -73,27 +73,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, + "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, - "completions/max_terminated_length": 238.0, - "completions/mean_length": 253.75, - "completions/mean_terminated_length": 238.0, - "completions/min_length": 238.0, - "completions/min_terminated_length": 238.0, - "epoch": 5.767788822025263e-05, - "grad_norm": 0.5458233952522278, - "kl": 0.0004711461369879544, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 5.99538355466291e-05, + "grad_norm": 0.40051016211509705, + "kl": 0.0003794784133788198, "learning_rate": 8.000000000000001e-06, "loss": 0.0, - "num_tokens": 10639.0, - "reward": 30.49321746826172, - "reward_std": 60.896018981933594, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 6.25, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.008382699452340603, - "rewards/sentence_structure_reward/std": 0.0015127100050449371, + "num_tokens": 12413.0, + "reward": 60.94150161743164, + "reward_std": 60.88746643066406, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.008093032985925674, + "rewards/sentence_structure_reward/std": 0.003198219696059823, "step": 3 }, { @@ -104,25 +104,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 126.0, - "completions/mean_length": 239.75, - "completions/mean_terminated_length": 126.0, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, - "epoch": 7.690385096033684e-05, - "grad_norm": 0.3946276307106018, - "kl": 0.0005535021555260755, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 254.25, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 7.993844739550546e-05, + "grad_norm": 0.3775780498981476, + "kl": 0.0005384029354900122, "learning_rate": 1.2e-05, "loss": 0.0, - "num_tokens": 14309.0, - "reward": 75.87001037597656, - "reward_std": 92.9371337890625, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.009218699298799038, - "rewards/sentence_structure_reward/std": 0.004998977296054363, + "num_tokens": 15683.0, + "reward": 91.08601379394531, + "reward_std": 88.22401428222656, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.007503539323806763, + "rewards/sentence_structure_reward/std": 0.0018910289509221911, "step": 4 }, { @@ -138,20 +138,20 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 9.612981370042105e-05, - "grad_norm": 0.3297968804836273, - "kl": 0.0004953827010467649, + "epoch": 9.992305924438183e-05, + "grad_norm": 0.38194945454597473, + "kl": 0.0005026419530622661, "learning_rate": 1.6000000000000003e-05, "loss": 0.0, - "num_tokens": 17941.0, - "reward": 30.483320236206055, - "reward_std": 60.90174102783203, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 6.25, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.006472452078014612, - "rewards/sentence_structure_reward/std": 0.001417727442458272, + "num_tokens": 19471.0, + "reward": 136.46142578125, + "reward_std": 79.07363891601562, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.008071230724453926, + "rewards/sentence_structure_reward/std": 0.002038340549916029, "step": 5 }, { @@ -162,25 +162,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 240.0, - "completions/mean_length": 254.0, - "completions/mean_terminated_length": 240.0, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, - "epoch": 0.00011535577644050526, - "grad_norm": 0.390165239572525, - "kl": 0.0005446467112051323, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.0001199076710932582, + "grad_norm": 2.1737723350524902, + "kl": 0.0006963407795410603, "learning_rate": 2e-05, "loss": 0.0, - "num_tokens": 21105.0, - "reward": 75.85758972167969, - "reward_std": 79.68409729003906, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.006822699680924416, - "rewards/sentence_structure_reward/std": 0.0014274070272222161, + "num_tokens": 22717.0, + "reward": 30.4832820892334, + "reward_std": 35.17006301879883, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 6.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006464952602982521, + "rewards/sentence_structure_reward/std": 0.0036425672005861998, "step": 6 }, { @@ -189,27 +189,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 1.0, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 0.0, - "completions/mean_length": 256.0, - "completions/mean_terminated_length": 0.0, - "completions/min_length": 256.0, - "completions/min_terminated_length": 0.0, - "epoch": 0.00013458173918058948, - "grad_norm": 0.43533363938331604, - "kl": 0.0005274449940770864, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 250.75, + "completions/mean_terminated_length": 214.0, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.00013989228294213456, + "grad_norm": 0.3898351788520813, + "kl": 0.0004418441385496408, "learning_rate": 2.4e-05, "loss": 0.0, - "num_tokens": 26013.0, - "reward": 30.48623275756836, - "reward_std": 35.162193298339844, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 6.25, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.007034247275441885, - "rewards/sentence_structure_reward/std": 0.0009639360359869897, + "num_tokens": 26147.0, + "reward": 45.71279525756836, + "reward_std": 65.60282897949219, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.007357113063335419, + "rewards/sentence_structure_reward/std": 0.0019852188415825367, "step": 7 }, { @@ -218,27 +218,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 172.0, - "completions/mean_length": 228.5, - "completions/mean_terminated_length": 146.0, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, - "epoch": 0.00015380770192067368, - "grad_norm": 0.3855687081813812, - "kl": 0.0006293374208325986, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 138.0, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.0001598768947910109, + "grad_norm": 0.3169557750225067, + "kl": 0.00045419411617331207, "learning_rate": 2.8e-05, "loss": 0.0, - "num_tokens": 29293.0, - "reward": 45.7190055847168, - "reward_std": 65.608154296875, - "rewards/keyword_presence_reward/mean": 0.375, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.00855495035648346, - "rewards/sentence_structure_reward/std": 0.004264699295163155, + "num_tokens": 29649.0, + "reward": 30.479446411132812, + "reward_std": 60.906715393066406, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 6.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.005724778864532709, + "rewards/sentence_structure_reward/std": 0.003959107678383589, "step": 8 }, { @@ -247,27 +247,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 1.0, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 0.0, - "completions/mean_length": 256.0, - "completions/mean_terminated_length": 0.0, - "completions/min_length": 256.0, - "completions/min_terminated_length": 0.0, - "epoch": 0.0001730336646607579, - "grad_norm": 0.44653329253196716, - "kl": 0.0009850938222371042, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 244.25, + "completions/mean_terminated_length": 162.0, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.0001798615066398873, + "grad_norm": 0.36395329236984253, + "kl": 0.0008125847525661811, "learning_rate": 3.2000000000000005e-05, "loss": 0.0, - "num_tokens": 33057.0, - "reward": 75.86065673828125, - "reward_std": 92.93635559082031, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.00741471815854311, - "rewards/sentence_structure_reward/std": 0.001862735254690051, + "num_tokens": 33379.0, + "reward": 106.3137435913086, + "reward_std": 64.99724578857422, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.008051094599068165, + "rewards/sentence_structure_reward/std": 0.003721932414919138, "step": 9 }, { @@ -276,27 +276,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 181.0, - "completions/mean_length": 228.25, - "completions/mean_terminated_length": 145.0, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, - "epoch": 0.0001922596274008421, - "grad_norm": 0.357486754655838, - "kl": 0.0007498150298488326, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 250.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.00019984611848876367, + "grad_norm": 0.3852463662624359, + "kl": 0.0010140962403966114, "learning_rate": 3.6e-05, "loss": 0.0, - "num_tokens": 36451.0, - "reward": 136.4630889892578, - "reward_std": 79.07745361328125, - "rewards/keyword_presence_reward/mean": 0.875, - "rewards/keyword_presence_reward/std": 0.3535533845424652, - "rewards/reward_keyword_presence/mean": 28.125, - "rewards/reward_keyword_presence/std": 16.02174949645996, - "rewards/sentence_structure_reward/mean": 0.008394160307943821, - "rewards/sentence_structure_reward/std": 0.003010717686265707, + "num_tokens": 36487.0, + "reward": 45.70966339111328, + "reward_std": 65.61338806152344, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.0067517999559640884, + "rewards/sentence_structure_reward/std": 0.001707072020508349, "step": 10 }, { @@ -305,27 +305,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.625, + "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, - "completions/max_terminated_length": 179.0, - "completions/mean_length": 193.875, - "completions/mean_terminated_length": 90.33333587646484, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, - "epoch": 0.0002114855901409263, - "grad_norm": 1.1071817874908447, - "kl": 0.0018053226813208312, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 222.125, + "completions/mean_terminated_length": 120.5, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.00021983073033764002, + "grad_norm": 0.751896858215332, + "kl": 0.0030199767788872123, "learning_rate": 4e-05, "loss": 0.0001, - "num_tokens": 39654.0, - "reward": 30.498306274414062, - "reward_std": 35.1629524230957, + "num_tokens": 40016.0, + "reward": 30.49408721923828, + "reward_std": 60.921348571777344, "rewards/keyword_presence_reward/mean": 0.25, "rewards/keyword_presence_reward/std": 0.4629100561141968, "rewards/reward_keyword_presence/mean": 6.25, "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.009364679455757141, - "rewards/sentence_structure_reward/std": 0.006163181271404028, + "rewards/sentence_structure_reward/mean": 0.008550537750124931, + "rewards/sentence_structure_reward/std": 0.00558843370527029, "step": 11 }, { @@ -334,29 +334,58 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 193.0, - "completions/mean_length": 239.25, - "completions/mean_terminated_length": 189.0, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, - "epoch": 0.0002307115528810105, - "grad_norm": 0.3807227909564972, - "kl": 0.0013619658784591593, - "learning_rate": 3.9938346674662565e-05, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 246.875, + "completions/mean_terminated_length": 183.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.0002398153421865164, + "grad_norm": 0.45209842920303345, + "kl": 0.002819138753693551, + "learning_rate": 3.9999588939167326e-05, "loss": 0.0001, - "num_tokens": 42744.0, - "reward": 30.18766975402832, - "reward_std": 60.302677154541016, - "rewards/keyword_presence_reward/mean": 0.125, - "rewards/keyword_presence_reward/std": 0.3535533845424652, - "rewards/reward_keyword_presence/mean": 6.25, - "rewards/reward_keyword_presence/std": 17.677669525146484, - "rewards/sentence_structure_reward/mean": 0.007740631699562073, - "rewards/sentence_structure_reward/std": 0.002558372216299176, + "num_tokens": 43667.0, + "reward": 45.71592712402344, + "reward_std": 65.61399841308594, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.00796138122677803, + "rewards/sentence_structure_reward/std": 0.0016098901396617293, "step": 12 }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.00025979995403539274, + "grad_norm": 0.3511572778224945, + "kl": 0.001639918585169653, + "learning_rate": 3.999835577356639e-05, + "loss": 0.0001, + "num_tokens": 48550.0, + "reward": 60.63811111450195, + "reward_std": 88.22590637207031, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.007867585867643356, + "rewards/sentence_structure_reward/std": 0.002370655769482255, + "step": 13 + }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, @@ -365,26 +394,26 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, - "completions/max_terminated_length": 252.0, - "completions/mean_length": 231.5, - "completions/mean_terminated_length": 158.0, - "completions/min_length": 64.0, - "completions/min_terminated_length": 64.0, - "epoch": 0.00024993751562109475, - "grad_norm": 0.44279101490974426, - "kl": 0.002703821344766766, - "learning_rate": 3.9753766811902756e-05, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 237.5, + "completions/mean_terminated_length": 182.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.0002797845658842691, + "grad_norm": 0.3840062618255615, + "kl": 0.0026373607688583434, + "learning_rate": 3.99963005538878e-05, "loss": 0.0001, - "num_tokens": 46728.0, - "reward": 30.503662109375, - "reward_std": 35.171470642089844, + "num_tokens": 52394.0, + "reward": 30.49520492553711, + "reward_std": 60.889793395996094, "rewards/keyword_presence_reward/mean": 0.25, "rewards/keyword_presence_reward/std": 0.4629100561141968, "rewards/reward_keyword_presence/mean": 6.25, "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.010398203507065773, - "rewards/sentence_structure_reward/std": 0.006824915297329426, - "step": 13 + "rewards/sentence_structure_reward/mean": 0.008766276761889458, + "rewards/sentence_structure_reward/std": 0.002746498677879572, + "step": 14 }, { "clip_ratio/high_max": 0.0, @@ -392,28 +421,28 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 1.0, + "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, - "completions/max_terminated_length": 0.0, - "completions/mean_length": 256.0, - "completions/mean_terminated_length": 0.0, - "completions/min_length": 256.0, - "completions/min_terminated_length": 0.0, - "epoch": 0.00026916347836117895, - "grad_norm": 0.6306744813919067, - "kl": 0.002872352139092982, - "learning_rate": 3.9447398407953536e-05, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 159.6666717529297, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.0002997691777331455, + "grad_norm": 0.4022621512413025, + "kl": 0.002699694479815662, + "learning_rate": 3.99934233646136e-05, "loss": 0.0001, - "num_tokens": 50644.0, - "reward": 105.70105743408203, - "reward_std": 90.74132537841797, + "num_tokens": 55421.0, + "reward": 75.86482238769531, + "reward_std": 79.6762924194336, "rewards/keyword_presence_reward/mean": 0.5, "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 21.875, - "rewards/reward_keyword_presence/std": 24.775779724121094, - "rewards/sentence_structure_reward/mean": 0.006459136493504047, - "rewards/sentence_structure_reward/std": 0.0014907469740137458, - "step": 14 + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.0082187969237566, + "rewards/sentence_structure_reward/std": 0.003897033166140318, + "step": 15 }, { "clip_ratio/high_max": 0.0, @@ -428,49 +457,20 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.00028838944110126316, - "grad_norm": 0.37339434027671814, - "kl": 0.0039220866456162184, - "learning_rate": 3.9021130325903076e-05, - "loss": 0.0002, - "num_tokens": 54092.0, - "reward": 76.15447998046875, - "reward_std": 65.60916137695312, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.005792803131043911, - "rewards/sentence_structure_reward/std": 0.0004974025068804622, - "step": 15 - }, - { - "clip_ratio/high_max": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, - "completions/max_length": 256.0, - "completions/max_terminated_length": 170.0, - "completions/mean_length": 230.125, - "completions/mean_terminated_length": 152.5, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, - "epoch": 0.00030761540384134737, - "grad_norm": 0.4536955654621124, - "kl": 0.0045144540490582585, - "learning_rate": 3.8477590650225735e-05, + "epoch": 0.0003197537895820218, + "grad_norm": 0.4078104496002197, + "kl": 0.004393383278511465, + "learning_rate": 3.998972432401376e-05, "loss": 0.0002, - "num_tokens": 56829.0, - "reward": 105.70284271240234, - "reward_std": 139.36866760253906, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 21.875, - "rewards/reward_keyword_presence/std": 28.149791717529297, - "rewards/sentence_structure_reward/mean": 0.0068035246804356575, - "rewards/sentence_structure_reward/std": 0.0019828490912914276, + "num_tokens": 59297.0, + "reward": 136.15341186523438, + "reward_std": 106.90872192382812, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.006957118399441242, + "rewards/sentence_structure_reward/std": 0.0009539236780256033, "step": 16 }, { @@ -486,20 +486,20 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0003268413665814316, - "grad_norm": 0.3723878860473633, - "kl": 0.0016764590400271118, - "learning_rate": 3.782013048376736e-05, + "epoch": 0.0003397384014308982, + "grad_norm": 0.4322844445705414, + "kl": 0.0027627036906778812, + "learning_rate": 3.998520358414136e-05, "loss": 0.0001, - "num_tokens": 61649.0, - "reward": 45.418212890625, - "reward_std": 57.785423278808594, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.008830500766634941, - "rewards/sentence_structure_reward/std": 0.0027567746583372355, + "num_tokens": 63921.0, + "reward": 106.61506652832031, + "reward_std": 30.457597732543945, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007878022268414497, + "rewards/sentence_structure_reward/std": 0.002755282912403345, "step": 17 }, { @@ -508,27 +508,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, + "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, - "completions/max_terminated_length": 56.0, - "completions/mean_length": 231.0, - "completions/mean_terminated_length": 56.0, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, - "epoch": 0.0003460673293215158, - "grad_norm": 0.41773512959480286, - "kl": 0.004305432550609112, - "learning_rate": 3.705280328708185e-05, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0003597230132797746, + "grad_norm": 0.2982736825942993, + "kl": 0.0046585101808886975, + "learning_rate": 3.9979861330826295e-05, "loss": 0.0002, - "num_tokens": 64973.0, - "reward": 45.744075775146484, - "reward_std": 30.39153480529785, - "rewards/keyword_presence_reward/mean": 0.375, + "num_tokens": 68069.0, + "reward": 120.9271469116211, + "reward_std": 115.45002746582031, + "rewards/keyword_presence_reward/mean": 0.625, "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.013393515720963478, - "rewards/sentence_structure_reward/std": 0.015977438539266586, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.006690750829875469, + "rewards/sentence_structure_reward/std": 0.0021281519439071417, "step": 18 }, { @@ -537,27 +537,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 1.0, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 0.0, - "completions/mean_length": 256.0, - "completions/mean_terminated_length": 0.0, - "completions/min_length": 256.0, - "completions/min_terminated_length": 0.0, - "epoch": 0.0003652932920616, - "grad_norm": 0.31560006737709045, - "kl": 0.003967506752815098, - "learning_rate": 3.6180339887498953e-05, - "loss": 0.0002, - "num_tokens": 69377.0, - "reward": 91.39585876464844, - "reward_std": 35.17618179321289, - "rewards/keyword_presence_reward/mean": 0.75, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 18.75, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.008975526317954063, - "rewards/sentence_structure_reward/std": 0.002985145663842559, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 236.0, + "completions/mean_terminated_length": 96.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.00037970762512865095, + "grad_norm": 0.32902616262435913, + "kl": 0.002009416406508535, + "learning_rate": 3.997369778366769e-05, + "loss": 0.0001, + "num_tokens": 71741.0, + "reward": 75.85345458984375, + "reward_std": 90.75408935546875, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006024259142577648, + "rewards/sentence_structure_reward/std": 0.004815916996449232, "step": 19 }, { @@ -568,25 +568,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 140.0, - "completions/mean_length": 241.5, - "completions/mean_terminated_length": 140.0, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, - "epoch": 0.0003845192548016842, - "grad_norm": 0.39019882678985596, - "kl": 0.00409153540385887, - "learning_rate": 3.520811931200063e-05, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 254.625, + "completions/mean_terminated_length": 245.0, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.00039969223697752733, + "grad_norm": 0.3680703639984131, + "kl": 0.005848537781275809, + "learning_rate": 3.9966713196024824e-05, "loss": 0.0002, - "num_tokens": 72885.0, - "reward": 60.6353874206543, - "reward_std": 49.237728118896484, - "rewards/keyword_presence_reward/mean": 0.375, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 12.5, - "rewards/reward_keyword_presence/std": 18.898223876953125, - "rewards/sentence_structure_reward/mean": 0.007341583259403706, - "rewards/sentence_structure_reward/std": 0.0023867818526923656, + "num_tokens": 75342.0, + "reward": 90.77346801757812, + "reward_std": 88.1178207397461, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 22.160131454467773, + "rewards/sentence_structure_reward/mean": 0.005511060822755098, + "rewards/sentence_structure_reward/std": 0.0029544285498559475, "step": 20 }, { @@ -597,25 +597,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 226.0, - "completions/mean_length": 252.25, - "completions/mean_terminated_length": 226.0, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, - "epoch": 0.0004037452175417684, - "grad_norm": 0.41628676652908325, - "kl": 0.007002327387453988, - "learning_rate": 3.4142135623730954e-05, - "loss": 0.0003, - "num_tokens": 76559.0, - "reward": 75.86404418945312, - "reward_std": 79.67518615722656, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 219.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.00041967684882640365, + "grad_norm": 0.42022237181663513, + "kl": 0.0034152381704188883, + "learning_rate": 3.995890785500673e-05, + "loss": -0.0003, + "num_tokens": 78425.0, + "reward": 76.15742492675781, + "reward_std": 30.45340347290039, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.008066326379776001, - "rewards/sentence_structure_reward/std": 0.0018108647782355547, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.0063628037460148335, + "rewards/sentence_structure_reward/std": 0.0011169032659381628, "step": 21 }, { @@ -631,21 +631,5850 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0004229711802818526, - "grad_norm": 0.4388905167579651, - "kl": 0.0069318081077653915, - "learning_rate": 3.298896096660367e-05, - "loss": -0.0, - "num_tokens": 80327.0, - "reward": 91.38612365722656, - "reward_std": 35.160011291503906, - "rewards/keyword_presence_reward/mean": 0.75, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 18.75, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.0070976316928863525, - "rewards/sentence_structure_reward/std": 0.00217603612691164, - "step": 22 + "epoch": 0.00043966146067528003, + "grad_norm": 0.3547864854335785, + "kl": 0.0033517318370286375, + "learning_rate": 3.995028208146042e-05, + "loss": 0.0001, + "num_tokens": 82305.0, + "reward": 15.263712882995605, + "reward_std": 30.450775146484375, + "rewards/keyword_presence_reward/mean": 0.125, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 3.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007492370903491974, + "rewards/sentence_structure_reward/std": 0.0013515567407011986, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 238.0, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.0004596460725241564, + "grad_norm": 0.43193894624710083, + "kl": 0.00343302427791059, + "learning_rate": 3.994083622995764e-05, + "loss": 0.0001, + "num_tokens": 85347.0, + "reward": 30.486154556274414, + "reward_std": 35.163700103759766, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 6.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.007019665092229843, + "rewards/sentence_structure_reward/std": 0.0014217470306903124, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 169.33334350585938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0004796306843730328, + "grad_norm": 0.4179834723472595, + "kl": 0.004458332012291066, + "learning_rate": 3.993057068878037e-05, + "loss": 0.0002, + "num_tokens": 88287.0, + "reward": 91.08903503417969, + "reward_std": 88.22101593017578, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.00808693841099739, + "rewards/sentence_structure_reward/std": 0.004810389596968889, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0004996152962219091, + "grad_norm": 0.32437941431999207, + "kl": 0.0024505460169166327, + "learning_rate": 3.991948587990479e-05, + "loss": 0.0001, + "num_tokens": 92291.0, + "reward": 30.482154846191406, + "reward_std": 60.898765563964844, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 6.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006247351877391338, + "rewards/sentence_structure_reward/std": 0.0008982332656159997, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 245.625, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.0005195999080707855, + "grad_norm": 0.3516964912414551, + "kl": 0.0027946206682827324, + "learning_rate": 3.990758225898397e-05, + "loss": 0.0001, + "num_tokens": 96124.0, + "reward": 45.71698760986328, + "reward_std": 65.61055755615234, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.008166216313838959, + "rewards/sentence_structure_reward/std": 0.0021410335320979357, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0005395845199196619, + "grad_norm": 0.3330807387828827, + "kl": 0.004737653536722064, + "learning_rate": 3.989486031532915e-05, + "loss": 0.0002, + "num_tokens": 99620.0, + "reward": 91.081787109375, + "reward_std": 60.293983459472656, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.006686875130981207, + "rewards/sentence_structure_reward/std": 0.0018701679073274136, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 256.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 239.375, + "completions/mean_terminated_length": 189.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.0005595691317685382, + "grad_norm": 0.3370240032672882, + "kl": 0.0037777119432576, + "learning_rate": 3.988132057188961e-05, + "loss": 0.0002, + "num_tokens": 102543.0, + "reward": 76.1638412475586, + "reward_std": 65.60563659667969, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.007599936798214912, + "rewards/sentence_structure_reward/std": 0.0029245859477669, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 256.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 246.125, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.0005795537436174146, + "grad_norm": 0.3176335394382477, + "kl": 0.006047982140444219, + "learning_rate": 3.9866963585231165e-05, + "loss": 0.0002, + "num_tokens": 105968.0, + "reward": 30.49212646484375, + "reward_std": 60.893898010253906, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 6.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008172232657670975, + "rewards/sentence_structure_reward/std": 0.0021524433977901936, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.000599538355466291, + "grad_norm": 0.32324549555778503, + "kl": 0.0027866671443916857, + "learning_rate": 3.98517899455133e-05, + "loss": 0.0001, + "num_tokens": 110504.0, + "reward": 60.635536193847656, + "reward_std": 95.454833984375, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.007370438892394304, + "rewards/sentence_structure_reward/std": 0.0017719126772135496, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0006195229673151674, + "grad_norm": 0.2613885998725891, + "kl": 0.002322520042071119, + "learning_rate": 3.983580027646492e-05, + "loss": 0.0001, + "num_tokens": 115112.0, + "reward": 106.31047058105469, + "reward_std": 65.00648498535156, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.0074193538166582584, + "rewards/sentence_structure_reward/std": 0.002351284259930253, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 247.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.0006395075791640436, + "grad_norm": 7.667415142059326, + "kl": 0.0053951369191054255, + "learning_rate": 3.98189952353587e-05, + "loss": 0.0002, + "num_tokens": 120188.0, + "reward": 45.41682434082031, + "reward_std": 90.74246215820312, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.008563091978430748, + "rewards/sentence_structure_reward/std": 0.001968884840607643, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00065949219101292, + "grad_norm": 0.2934196889400482, + "kl": 0.0058088768855668604, + "learning_rate": 3.9801375512984044e-05, + "loss": 0.0002, + "num_tokens": 124116.0, + "reward": 15.258341789245605, + "reward_std": 30.455270767211914, + "rewards/keyword_presence_reward/mean": 0.125, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 3.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006455921567976475, + "rewards/sentence_structure_reward/std": 0.0015936228446662426, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0006794768028617964, + "grad_norm": 0.385358989238739, + "kl": 0.007211260846816003, + "learning_rate": 3.9782941833618724e-05, + "loss": 0.0003, + "num_tokens": 127728.0, + "reward": 60.93626403808594, + "reward_std": 60.899070739746094, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007081340067088604, + "rewards/sentence_structure_reward/std": 0.0021946935448795557, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 247.625, + "completions/mean_terminated_length": 189.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.0006994614147106728, + "grad_norm": 0.36085864901542664, + "kl": 0.00623788678785786, + "learning_rate": 3.976369495499912e-05, + "loss": 0.0002, + "num_tokens": 131637.0, + "reward": 60.93710708618164, + "reward_std": 60.89747619628906, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007244637235999107, + "rewards/sentence_structure_reward/std": 0.0013477250467985868, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 256.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 244.5, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.0007194460265595492, + "grad_norm": 0.3804939389228821, + "kl": 0.008292036422062665, + "learning_rate": 3.974363566828901e-05, + "loss": 0.0003, + "num_tokens": 134762.0, + "reward": 60.932899475097656, + "reward_std": 70.31549072265625, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.006432830356061459, + "rewards/sentence_structure_reward/std": 0.0019866900984197855, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0007394306384084255, + "grad_norm": 0.2606145143508911, + "kl": 0.006737934047123417, + "learning_rate": 3.97227647980471e-05, + "loss": 0.0003, + "num_tokens": 138274.0, + "reward": 45.702064514160156, + "reward_std": 30.458818435668945, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.005284905433654785, + "rewards/sentence_structure_reward/std": 0.0022680331021547318, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0007594152502573019, + "grad_norm": 0.41060540080070496, + "kl": 0.006080534745706245, + "learning_rate": 3.970108320219314e-05, + "loss": 0.0002, + "num_tokens": 142078.0, + "reward": 105.7036361694336, + "reward_std": 90.74586486816406, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 24.775779724121094, + "rewards/sentence_structure_reward/mean": 0.006957275792956352, + "rewards/sentence_structure_reward/std": 0.0013196211075410247, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0007793998621061783, + "grad_norm": 0.5079838037490845, + "kl": 0.010547310870606452, + "learning_rate": 3.96785917719726e-05, + "loss": 0.0004, + "num_tokens": 145738.0, + "reward": 91.07868957519531, + "reward_std": 84.39026641845703, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.006089614704251289, + "rewards/sentence_structure_reward/std": 0.001248899265192449, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 245.0, + "completions/mean_terminated_length": 168.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0007993844739550547, + "grad_norm": 0.40107324719429016, + "kl": 0.011110408697277308, + "learning_rate": 3.965529143192008e-05, + "loss": 0.0004, + "num_tokens": 148754.0, + "reward": 45.70843505859375, + "reward_std": 65.61022186279297, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006515481509268284, + "rewards/sentence_structure_reward/std": 0.001049907528795302, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0008193690858039309, + "grad_norm": 0.36239564418792725, + "kl": 0.006966580462176353, + "learning_rate": 3.963118313982131e-05, + "loss": 0.0003, + "num_tokens": 152554.0, + "reward": 45.71627426147461, + "reward_std": 65.61026763916016, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.008028284646570683, + "rewards/sentence_structure_reward/std": 0.0034574747551232576, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0008393536976528073, + "grad_norm": 0.4240066707134247, + "kl": 0.020611065963748842, + "learning_rate": 3.960626788667375e-05, + "loss": 0.0008, + "num_tokens": 156306.0, + "reward": 15.256549835205078, + "reward_std": 30.454877853393555, + "rewards/keyword_presence_reward/mean": 0.125, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 3.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006109886337071657, + "rewards/sentence_structure_reward/std": 0.0012247450649738312, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0008593383095016837, + "grad_norm": 0.3700573146343231, + "kl": 0.009186381532344967, + "learning_rate": 3.958054669664586e-05, + "loss": 0.0004, + "num_tokens": 160434.0, + "reward": 76.16595458984375, + "reward_std": 65.61014556884766, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.008008204400539398, + "rewards/sentence_structure_reward/std": 0.0013299931306391954, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 82.0, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.0008793229213505601, + "grad_norm": 0.5953694581985474, + "kl": 0.01785224862396717, + "learning_rate": 3.9554020627035034e-05, + "loss": 0.0007, + "num_tokens": 164236.0, + "reward": 45.41664505004883, + "reward_std": 57.801788330078125, + "rewards/keyword_presence_reward/mean": 0.25, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.008528131060302258, + "rewards/sentence_structure_reward/std": 0.004299451131373644, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 256.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 234.125, + "completions/mean_terminated_length": 168.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0008993075331994364, + "grad_norm": 0.4540475308895111, + "kl": 0.011944298690650612, + "learning_rate": 3.952669076822409e-05, + "loss": 0.0005, + "num_tokens": 167277.0, + "reward": 106.30879211425781, + "reward_std": 65.00543212890625, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.007095631677657366, + "rewards/sentence_structure_reward/std": 0.0018645260715857148, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0009192921450483128, + "grad_norm": 0.3490599989891052, + "kl": 0.007468605879694223, + "learning_rate": 3.949855824363647e-05, + "loss": 0.0003, + "num_tokens": 172733.0, + "reward": 45.73394775390625, + "reward_std": 65.62181091308594, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.011439412832260132, + "rewards/sentence_structure_reward/std": 0.009165159426629543, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0009392767568971892, + "grad_norm": 0.37753745913505554, + "kl": 0.013447662611724809, + "learning_rate": 3.94696242096901e-05, + "loss": 0.0005, + "num_tokens": 176385.0, + "reward": 75.85598754882812, + "reward_std": 90.74575805664062, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006512562744319439, + "rewards/sentence_structure_reward/std": 0.001255925977602601, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0009592613687460656, + "grad_norm": 0.3506099581718445, + "kl": 0.010885529103688896, + "learning_rate": 3.943988985574976e-05, + "loss": 0.0004, + "num_tokens": 180449.0, + "reward": 106.30877685546875, + "reward_std": 79.68098449707031, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.007092979270964861, + "rewards/sentence_structure_reward/std": 0.0025537435431033373, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 256.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.000979245980594942, + "grad_norm": 0.8308688998222351, + "kl": 0.021679759374819696, + "learning_rate": 3.9409356404078296e-05, + "loss": 0.0009, + "num_tokens": 183678.0, + "reward": 76.1637954711914, + "reward_std": 65.59480285644531, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.007591024041175842, + "rewards/sentence_structure_reward/std": 0.00554612884297967, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 238.5, + "completions/mean_terminated_length": 116.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.0009992305924438182, + "grad_norm": 0.5704435110092163, + "kl": 0.02520259830635041, + "learning_rate": 3.937802510978631e-05, + "loss": 0.001, + "num_tokens": 186986.0, + "reward": 91.38750457763672, + "reward_std": 35.173248291015625, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.007362786680459976, + "rewards/sentence_structure_reward/std": 0.002714392263442278, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0010192152042926947, + "grad_norm": 0.3701224625110626, + "kl": 0.01107728574424982, + "learning_rate": 3.934589726078059e-05, + "loss": 0.0, + "num_tokens": 190278.0, + "reward": 106.60865783691406, + "reward_std": 30.4531307220459, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006641001906245947, + "rewards/sentence_structure_reward/std": 0.0013526929542422295, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.001039199816141571, + "grad_norm": 0.587343156337738, + "kl": 0.04292119946330786, + "learning_rate": 3.931297417771118e-05, + "loss": 0.0017, + "num_tokens": 194019.0, + "reward": 75.8651351928711, + "reward_std": 79.67740631103516, + "rewards/keyword_presence_reward/mean": 0.5, + "rewards/keyword_presence_reward/std": 0.5345224738121033, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.008278170600533485, + "rewards/sentence_structure_reward/std": 0.007953895255923271, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 228.625, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.0010591844279904475, + "grad_norm": 2.716738224029541, + "kl": 0.02232059364905581, + "learning_rate": 3.927925721391707e-05, + "loss": 0.0009, + "num_tokens": 197796.0, + "reward": 91.4142837524414, + "reward_std": 60.87089538574219, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.012531505897641182, + "rewards/sentence_structure_reward/std": 0.008540182374417782, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0010791690398393237, + "grad_norm": 0.5389519333839417, + "kl": 0.010715900105424225, + "learning_rate": 3.924474775537058e-05, + "loss": 0.0004, + "num_tokens": 202016.0, + "reward": 121.24063110351562, + "reward_std": 69.62336730957031, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.008864352479577065, + "rewards/sentence_structure_reward/std": 0.0018129952950403094, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0010991536516882, + "grad_norm": 0.36584925651550293, + "kl": 0.02433936600573361, + "learning_rate": 3.920944722062039e-05, + "loss": 0.0003, + "num_tokens": 205608.0, + "reward": 136.75421142578125, + "reward_std": 29.845521926879883, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006254056468605995, + "rewards/sentence_structure_reward/std": 0.001142276218160987, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0011191382635370765, + "grad_norm": 1.4970893859863281, + "kl": 0.016215094714425504, + "learning_rate": 3.9173357060733213e-05, + "loss": 0.0006, + "num_tokens": 209068.0, + "reward": 60.62938690185547, + "reward_std": 88.23365783691406, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 12.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.006183166988193989, + "rewards/sentence_structure_reward/std": 0.0029645839240401983, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 234.125, + "completions/mean_terminated_length": 81.0, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0011391228753859528, + "grad_norm": 0.48272788524627686, + "kl": 0.017312932992354035, + "learning_rate": 3.913647875923418e-05, + "loss": 0.0008, + "num_tokens": 212661.0, + "reward": 121.5530014038086, + "reward_std": 49.213661193847656, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.010823261924088001, + "rewards/sentence_structure_reward/std": 0.010416340082883835, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0011591074872348292, + "grad_norm": 0.35397249460220337, + "kl": 0.012197267962619662, + "learning_rate": 3.909881383204581e-05, + "loss": -0.0001, + "num_tokens": 216509.0, + "reward": 121.53849792480469, + "reward_std": 49.23612976074219, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.00802415981888771, + "rewards/sentence_structure_reward/std": 0.0007656202069483697, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0011790920990837055, + "grad_norm": 0.26875412464141846, + "kl": 0.008053294412093237, + "learning_rate": 3.906036382742575e-05, + "loss": 0.0003, + "num_tokens": 220821.0, + "reward": 45.92039489746094, + "reward_std": 65.35962677001953, + "rewards/keyword_presence_reward/mean": 0.375, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 9.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.047424208372831345, + "rewards/sentence_structure_reward/std": 0.11557060480117798, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001199076710932582, + "grad_norm": 0.3850199580192566, + "kl": 0.021677418728359044, + "learning_rate": 3.9021130325903076e-05, + "loss": 0.0004, + "num_tokens": 224865.0, + "reward": 106.61028289794922, + "reward_std": 30.45132064819336, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006954835262149572, + "rewards/sentence_structure_reward/std": 0.0006638877675868571, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 252.375, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.0012190613227814583, + "grad_norm": 1.6747939586639404, + "kl": 0.01351847525802441, + "learning_rate": 3.898111494021338e-05, + "loss": 0.0005, + "num_tokens": 228392.0, + "reward": 121.53634643554688, + "reward_std": 60.28807830810547, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.0076080732978880405, + "rewards/sentence_structure_reward/std": 0.0013854404678568244, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0012390459346303347, + "grad_norm": 0.34143251180648804, + "kl": 0.017615517252124846, + "learning_rate": 3.894031931523243e-05, + "loss": 0.0008, + "num_tokens": 232028.0, + "reward": 136.76051330566406, + "reward_std": 29.852630615234375, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007469276897609234, + "rewards/sentence_structure_reward/std": 0.0026290693785995245, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001259030546479211, + "grad_norm": 0.33655115962028503, + "kl": 0.01813864556606859, + "learning_rate": 3.8898745127908586e-05, + "loss": 0.0007, + "num_tokens": 235568.0, + "reward": 106.31483459472656, + "reward_std": 65.00259399414062, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.00826185941696167, + "rewards/sentence_structure_reward/std": 0.003564495826140046, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0012790151583280873, + "grad_norm": 0.3403065800666809, + "kl": 0.02017145900754258, + "learning_rate": 3.885639408719386e-05, + "loss": 0.0008, + "num_tokens": 239056.0, + "reward": 106.30546569824219, + "reward_std": 79.67439270019531, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006453284062445164, + "rewards/sentence_structure_reward/std": 0.0014421244850382209, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0012989997701769638, + "grad_norm": 0.3347587585449219, + "kl": 0.011241376341786236, + "learning_rate": 3.8813267933973655e-05, + "loss": 0.0004, + "num_tokens": 243312.0, + "reward": 91.09044647216797, + "reward_std": 84.3927001953125, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.008358591236174107, + "rewards/sentence_structure_reward/std": 0.0020757517777383327, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00131898438202584, + "grad_norm": 0.31740280985832214, + "kl": 0.030262008076533675, + "learning_rate": 3.876936844099521e-05, + "loss": 0.0014, + "num_tokens": 246920.0, + "reward": 136.76296997070312, + "reward_std": 29.850709915161133, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.00794486328959465, + "rewards/sentence_structure_reward/std": 0.002007469767704606, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 117.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.0013389689938747165, + "grad_norm": 0.37221434712409973, + "kl": 0.027339605963788927, + "learning_rate": 3.872469741279475e-05, + "loss": 0.0016, + "num_tokens": 250073.0, + "reward": 121.83104705810547, + "reward_std": 0.011327862739562988, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006157414987683296, + "rewards/sentence_structure_reward/std": 0.002675427123904228, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0013589536057235928, + "grad_norm": 0.4223702847957611, + "kl": 0.025570991449058056, + "learning_rate": 3.867925668562327e-05, + "loss": 0.001, + "num_tokens": 253157.0, + "reward": 106.61027526855469, + "reward_std": 30.45111083984375, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006953577045351267, + "rewards/sentence_structure_reward/std": 0.0013567639980465174, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0013789382175724693, + "grad_norm": 0.626150369644165, + "kl": 0.01992089638952166, + "learning_rate": 3.863304812737109e-05, + "loss": 0.0008, + "num_tokens": 256545.0, + "reward": 106.31869506835938, + "reward_std": 79.67942810058594, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.009007125161588192, + "rewards/sentence_structure_reward/std": 0.0035292862448841333, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0013989228294213456, + "grad_norm": 0.3755885362625122, + "kl": 0.03095463989302516, + "learning_rate": 3.858607363749104e-05, + "loss": 0.0014, + "num_tokens": 259541.0, + "reward": 106.61100006103516, + "reward_std": 30.450210571289062, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007092923857271671, + "rewards/sentence_structure_reward/std": 0.0008285974618047476, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001418907441270222, + "grad_norm": 0.404275506734848, + "kl": 0.024192957091145217, + "learning_rate": 3.853833514692044e-05, + "loss": 0.001, + "num_tokens": 263497.0, + "reward": 91.09453582763672, + "reward_std": 84.39169311523438, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.009147986769676208, + "rewards/sentence_structure_reward/std": 0.0035741496831178665, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0014388920531190983, + "grad_norm": 0.4358273148536682, + "kl": 0.033832408022135496, + "learning_rate": 3.8489834618001633e-05, + "loss": 0.0005, + "num_tokens": 266809.0, + "reward": 136.7550811767578, + "reward_std": 29.846294403076172, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006419440731406212, + "rewards/sentence_structure_reward/std": 0.0006010282668285072, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0014588766649679746, + "grad_norm": 1.328521490097046, + "kl": 0.12083407444879413, + "learning_rate": 3.8440574044401414e-05, + "loss": 0.0048, + "num_tokens": 270401.0, + "reward": 121.56289672851562, + "reward_std": 60.24174499511719, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.012733520939946175, + "rewards/sentence_structure_reward/std": 0.014157221652567387, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001478861276816851, + "grad_norm": 0.337324321269989, + "kl": 0.031076728366315365, + "learning_rate": 3.839055545102902e-05, + "loss": 0.0015, + "num_tokens": 274477.0, + "reward": 121.53665161132812, + "reward_std": 49.23242950439453, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.0076678721234202385, + "rewards/sentence_structure_reward/std": 0.0022746494505554438, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 256.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 213.625, + "completions/mean_terminated_length": 86.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0014988458886657273, + "grad_norm": 10.067192077636719, + "kl": 0.1808934120927006, + "learning_rate": 3.833978089395291e-05, + "loss": 0.0072, + "num_tokens": 277402.0, + "reward": 136.46121215820312, + "reward_std": 64.92233276367188, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.008032754994928837, + "rewards/sentence_structure_reward/std": 0.004551195539534092, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0015188305005146038, + "grad_norm": 0.38250431418418884, + "kl": 0.0500550550641492, + "learning_rate": 3.828825246031625e-05, + "loss": 0.0022, + "num_tokens": 281958.0, + "reward": 136.758056640625, + "reward_std": 29.849376678466797, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.0069946106523275375, + "rewards/sentence_structure_reward/std": 0.00103515456430614, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00153881511236348, + "grad_norm": 0.42378926277160645, + "kl": 0.04328863904811442, + "learning_rate": 3.823597226825114e-05, + "loss": 0.0019, + "num_tokens": 285298.0, + "reward": 136.7613067626953, + "reward_std": 29.84734535217285, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.00762174092233181, + "rewards/sentence_structure_reward/std": 0.0022488809190690517, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0015587997242123566, + "grad_norm": 0.3983001410961151, + "kl": 0.04753530281595886, + "learning_rate": 3.81829424667915e-05, + "loss": 0.0021, + "num_tokens": 289162.0, + "reward": 121.83740234375, + "reward_std": 0.0090835802257061, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.007384970784187317, + "rewards/sentence_structure_reward/std": 0.0019522467628121376, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0015787843360612328, + "grad_norm": 0.3571934700012207, + "kl": 0.05808237474411726, + "learning_rate": 3.8129165235784765e-05, + "loss": 0.0032, + "num_tokens": 292746.0, + "reward": 121.8349380493164, + "reward_std": 0.006901816464960575, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006907949224114418, + "rewards/sentence_structure_reward/std": 0.001335795153863728, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0015987689479101093, + "grad_norm": 0.35538768768310547, + "kl": 0.048818540992215276, + "learning_rate": 3.807464278580227e-05, + "loss": 0.0016, + "num_tokens": 295914.0, + "reward": 136.75669860839844, + "reward_std": 29.84320640563965, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.00673374067991972, + "rewards/sentence_structure_reward/std": 0.001154298079200089, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0016187535597589856, + "grad_norm": 0.3952377438545227, + "kl": 0.043082613963633776, + "learning_rate": 3.801937735804838e-05, + "loss": 0.0017, + "num_tokens": 299594.0, + "reward": 151.68759155273438, + "reward_std": 59.68013000488281, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008321144618093967, + "rewards/sentence_structure_reward/std": 0.0027383428532630205, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0016387381716078619, + "grad_norm": 0.364397794008255, + "kl": 0.036351481918245554, + "learning_rate": 3.796337122426838e-05, + "loss": 0.0015, + "num_tokens": 302702.0, + "reward": 151.67819213867188, + "reward_std": 59.69171905517578, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006504442077130079, + "rewards/sentence_structure_reward/std": 0.0006916808197274804, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0016587227834567383, + "grad_norm": 0.37290436029434204, + "kl": 0.04909397638402879, + "learning_rate": 3.790662668665506e-05, + "loss": 0.0027, + "num_tokens": 305806.0, + "reward": 136.75299072265625, + "reward_std": 29.847497940063477, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006016939878463745, + "rewards/sentence_structure_reward/std": 0.0028947163373231888, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0016787073953056146, + "grad_norm": 0.8915900588035583, + "kl": 0.06582449516281486, + "learning_rate": 3.7849146077754124e-05, + "loss": 0.0025, + "num_tokens": 309526.0, + "reward": 121.83523559570312, + "reward_std": 0.009015304036438465, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006967578548938036, + "rewards/sentence_structure_reward/std": 0.0016708620823919773, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001698692007154491, + "grad_norm": 0.390764445066452, + "kl": 0.05409643822349608, + "learning_rate": 3.779093176036824e-05, + "loss": 0.0024, + "num_tokens": 313394.0, + "reward": 136.75311279296875, + "reward_std": 29.858076095581055, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.0060386499390006065, + "rewards/sentence_structure_reward/std": 0.0038903753738850355, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0017186766190033674, + "grad_norm": 0.40260064601898193, + "kl": 0.054133959114551544, + "learning_rate": 3.7731986127460006e-05, + "loss": 0.0021, + "num_tokens": 316378.0, + "reward": 121.83267974853516, + "reward_std": 0.012771755456924438, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.0064735133200883865, + "rewards/sentence_structure_reward/std": 0.0027301658410578966, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0017386612308522439, + "grad_norm": 0.3572516441345215, + "kl": 0.05867433222010732, + "learning_rate": 3.767231160205351e-05, + "loss": 0.002, + "num_tokens": 320510.0, + "reward": 166.59934997558594, + "reward_std": 29.858739852905273, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006213083863258362, + "rewards/sentence_structure_reward/std": 0.002774558262899518, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0017586458427011201, + "grad_norm": 0.45258814096450806, + "kl": 0.041663731914013624, + "learning_rate": 3.761191063713476e-05, + "loss": 0.0017, + "num_tokens": 324182.0, + "reward": 121.52487182617188, + "reward_std": 60.308448791503906, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.0053944275714457035, + "rewards/sentence_structure_reward/std": 0.0051556420512497425, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0017786304545499964, + "grad_norm": 0.4034987986087799, + "kl": 0.05180245125666261, + "learning_rate": 3.755078571555086e-05, + "loss": 0.0021, + "num_tokens": 327366.0, + "reward": 136.75120544433594, + "reward_std": 29.85380744934082, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.00567284133285284, + "rewards/sentence_structure_reward/std": 0.0024452214129269123, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0017986150663988729, + "grad_norm": 0.47377070784568787, + "kl": 0.04147388809360564, + "learning_rate": 3.7488939349907914e-05, + "loss": 0.0017, + "num_tokens": 330982.0, + "reward": 121.54263305664062, + "reward_std": 60.29094696044922, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.008822030387818813, + "rewards/sentence_structure_reward/std": 0.00323826028034091, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0018185996782477492, + "grad_norm": 0.39336612820625305, + "kl": 0.041609878186136484, + "learning_rate": 3.742637408246779e-05, + "loss": 0.0017, + "num_tokens": 334922.0, + "reward": 106.30340576171875, + "reward_std": 65.01343536376953, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006055374164134264, + "rewards/sentence_structure_reward/std": 0.0033748450223356485, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0018385842900966256, + "grad_norm": 0.37697914242744446, + "kl": 0.04276644508354366, + "learning_rate": 3.736309248504357e-05, + "loss": 0.0004, + "num_tokens": 338910.0, + "reward": 181.5271453857422, + "reward_std": 0.008044651709496975, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007202915847301483, + "rewards/sentence_structure_reward/std": 0.0015763341216370463, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001858568901945502, + "grad_norm": 0.2903560996055603, + "kl": 0.04065780679229647, + "learning_rate": 3.7299097158893876e-05, + "loss": 0.0016, + "num_tokens": 342838.0, + "reward": 106.30763244628906, + "reward_std": 79.67791748046875, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006871088407933712, + "rewards/sentence_structure_reward/std": 0.0019685053266584873, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0018785535137943784, + "grad_norm": 0.3594500422477722, + "kl": 0.04092976520769298, + "learning_rate": 3.72343907346159e-05, + "loss": 0.002, + "num_tokens": 345706.0, + "reward": 136.7524871826172, + "reward_std": 29.850196838378906, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.005918572656810284, + "rewards/sentence_structure_reward/std": 0.0025044798385351896, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0018985381256432547, + "grad_norm": 0.3608114421367645, + "kl": 0.04396634071599692, + "learning_rate": 3.716897587203733e-05, + "loss": 0.0018, + "num_tokens": 349474.0, + "reward": 196.14869689941406, + "reward_std": 64.90634155273438, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 26.51650619506836, + "rewards/sentence_structure_reward/mean": 0.007412685081362724, + "rewards/sentence_structure_reward/std": 0.0015448002377524972, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0019185227374921311, + "grad_norm": 0.35034164786338806, + "kl": 0.03378243022598326, + "learning_rate": 3.710285526010693e-05, + "loss": -0.0001, + "num_tokens": 352482.0, + "reward": 136.45462036132812, + "reward_std": 57.67726516723633, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006759897340089083, + "rewards/sentence_structure_reward/std": 0.0008004506235010922, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0019385073493410074, + "grad_norm": 0.40918344259262085, + "kl": 0.0453730917070061, + "learning_rate": 3.703603161678409e-05, + "loss": 0.0019, + "num_tokens": 356134.0, + "reward": 121.83447265625, + "reward_std": 0.006286046467721462, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006819131318479776, + "rewards/sentence_structure_reward/std": 0.0011775894090533257, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.001958491961189884, + "grad_norm": 0.31769728660583496, + "kl": 0.04801344173029065, + "learning_rate": 3.6968507688927054e-05, + "loss": 0.0022, + "num_tokens": 359498.0, + "reward": 136.7637176513672, + "reward_std": 29.844411849975586, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.008086403831839561, + "rewards/sentence_structure_reward/std": 0.000998539151623845, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00197847657303876, + "grad_norm": 0.3809857964515686, + "kl": 0.0434157932177186, + "learning_rate": 3.690028625218003e-05, + "loss": 0.0017, + "num_tokens": 362774.0, + "reward": 106.60781860351562, + "reward_std": 30.452434539794922, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.0064794463105499744, + "rewards/sentence_structure_reward/std": 0.0007346008787862957, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0019984611848876364, + "grad_norm": 0.4663333296775818, + "kl": 0.040989194763824344, + "learning_rate": 3.683137011085907e-05, + "loss": 0.0025, + "num_tokens": 366914.0, + "reward": 166.3039093017578, + "reward_std": 30.454692840576172, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.007519709412008524, + "rewards/sentence_structure_reward/std": 0.0010422584600746632, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002018445796736513, + "grad_norm": 0.3202873170375824, + "kl": 0.052380071952939034, + "learning_rate": 3.676176209783681e-05, + "loss": 0.0019, + "num_tokens": 370554.0, + "reward": 121.83412170410156, + "reward_std": 0.016215551644563675, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006751572713255882, + "rewards/sentence_structure_reward/std": 0.003325974103063345, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0020384304085853894, + "grad_norm": 0.33610251545906067, + "kl": 0.03472286823671311, + "learning_rate": 3.669146507442606e-05, + "loss": 0.0014, + "num_tokens": 374542.0, + "reward": 136.45980834960938, + "reward_std": 79.07533264160156, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.007760894019156694, + "rewards/sentence_structure_reward/std": 0.0013563705142587423, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0020584150204342655, + "grad_norm": 0.4171355068683624, + "kl": 0.03935318277217448, + "learning_rate": 3.662048193026208e-05, + "loss": 0.0016, + "num_tokens": 378006.0, + "reward": 181.52633666992188, + "reward_std": 59.6912841796875, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007044975645840168, + "rewards/sentence_structure_reward/std": 0.0014007376739755273, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002078399632283142, + "grad_norm": 0.5896279215812683, + "kl": 0.05653572571463883, + "learning_rate": 3.654881558318393e-05, + "loss": 0.0025, + "num_tokens": 381586.0, + "reward": 136.7564697265625, + "reward_std": 29.849140167236328, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.006686228793114424, + "rewards/sentence_structure_reward/std": 0.001666005584411323, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0020983842441320184, + "grad_norm": 0.45639991760253906, + "kl": 0.034765179734677076, + "learning_rate": 3.6476468979114435e-05, + "loss": 0.0014, + "num_tokens": 385530.0, + "reward": 181.5338592529297, + "reward_std": 59.690555572509766, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.008497844450175762, + "rewards/sentence_structure_reward/std": 0.005464548710733652, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002118368855980895, + "grad_norm": 0.33767175674438477, + "kl": 0.04220978089142591, + "learning_rate": 3.640344509193912e-05, + "loss": 0.0017, + "num_tokens": 390026.0, + "reward": 121.54151916503906, + "reward_std": 60.305503845214844, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.008607689291238785, + "rewards/sentence_structure_reward/std": 0.0035009433049708605, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002138353467829771, + "grad_norm": 0.3810741901397705, + "kl": 0.045649494510143995, + "learning_rate": 3.632974692338397e-05, + "loss": 0.0022, + "num_tokens": 393382.0, + "reward": 121.83490753173828, + "reward_std": 0.006655906327068806, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006902582012116909, + "rewards/sentence_structure_reward/std": 0.0012269754661247134, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0021583380796786475, + "grad_norm": 1.125035047531128, + "kl": 0.03972491808235645, + "learning_rate": 3.6255377502892055e-05, + "loss": 0.0017, + "num_tokens": 397002.0, + "reward": 166.6158447265625, + "reward_std": 29.8544921875, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.009395826607942581, + "rewards/sentence_structure_reward/std": 0.0029701110906898975, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002178322691527524, + "grad_norm": 0.3584715723991394, + "kl": 0.03357674542348832, + "learning_rate": 3.6180339887498953e-05, + "loss": 0.0013, + "num_tokens": 401494.0, + "reward": 121.53976440429688, + "reward_std": 60.29409408569336, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.008269078098237514, + "rewards/sentence_structure_reward/std": 0.001399196102283895, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0021983073033764, + "grad_norm": 0.37935322523117065, + "kl": 0.048895415384322405, + "learning_rate": 3.610463716170713e-05, + "loss": 0.002, + "num_tokens": 405838.0, + "reward": 181.23211669921875, + "reward_std": 97.963134765625, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.008588863536715508, + "rewards/sentence_structure_reward/std": 0.0014587591867893934, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0022182919152252765, + "grad_norm": 0.37604331970214844, + "kl": 0.08323597349226475, + "learning_rate": 3.602827243735913e-05, + "loss": 0.0033, + "num_tokens": 409758.0, + "reward": 151.6850128173828, + "reward_std": 34.47623825073242, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.007821301929652691, + "rewards/sentence_structure_reward/std": 0.0037995565216988325, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002238276527074153, + "grad_norm": 0.31912916898727417, + "kl": 0.04227019380778074, + "learning_rate": 3.5951248853509693e-05, + "loss": 0.002, + "num_tokens": 413426.0, + "reward": 136.76470947265625, + "reward_std": 29.846139907836914, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.008278263732790947, + "rewards/sentence_structure_reward/std": 0.0008149361819960177, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0022582611389230295, + "grad_norm": 0.34654438495635986, + "kl": 0.05677773617208004, + "learning_rate": 3.587356957629666e-05, + "loss": 0.0023, + "num_tokens": 416778.0, + "reward": 151.67874145507812, + "reward_std": 34.467952728271484, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006609901785850525, + "rewards/sentence_structure_reward/std": 0.0029327182564884424, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0022782457507719055, + "grad_norm": 0.3314315378665924, + "kl": 0.07482986524701118, + "learning_rate": 3.579523779881085e-05, + "loss": 0.0031, + "num_tokens": 420802.0, + "reward": 136.75234985351562, + "reward_std": 29.860536575317383, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.005891215987503529, + "rewards/sentence_structure_reward/std": 0.004088117741048336, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002298230362620782, + "grad_norm": 0.347928524017334, + "kl": 0.042590694152750075, + "learning_rate": 3.5716256740964854e-05, + "loss": 0.0021, + "num_tokens": 424058.0, + "reward": 121.84147644042969, + "reward_std": 0.02900577522814274, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.008171314373612404, + "rewards/sentence_structure_reward/std": 0.006810889113694429, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0023182149744696585, + "grad_norm": 0.3275226056575775, + "kl": 0.03519013000186533, + "learning_rate": 3.56366296493606e-05, + "loss": 0.0014, + "num_tokens": 428866.0, + "reward": 151.6896514892578, + "reward_std": 34.472190856933594, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008716396056115627, + "rewards/sentence_structure_reward/std": 0.0042094020172953606, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0023381995863185345, + "grad_norm": 0.32084059715270996, + "kl": 0.0395226429682225, + "learning_rate": 3.5556359797155946e-05, + "loss": 0.0016, + "num_tokens": 432650.0, + "reward": 151.70523071289062, + "reward_std": 34.464447021484375, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.011721793562173843, + "rewards/sentence_structure_reward/std": 0.0018708258867263794, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002358184198167411, + "grad_norm": 0.4576303958892822, + "kl": 0.06924406997859478, + "learning_rate": 3.5475450483930136e-05, + "loss": 0.0028, + "num_tokens": 436718.0, + "reward": 151.6864776611328, + "reward_std": 59.68297576904297, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008104050531983376, + "rewards/sentence_structure_reward/std": 0.003332611406221986, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0023781688100162875, + "grad_norm": 0.3538784384727478, + "kl": 0.044660965679213405, + "learning_rate": 3.539390503554812e-05, + "loss": 0.0016, + "num_tokens": 440174.0, + "reward": 121.84540557861328, + "reward_std": 0.019820302724838257, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.008929690346121788, + "rewards/sentence_structure_reward/std": 0.0043133217841386795, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002398153421865164, + "grad_norm": 0.342343270778656, + "kl": 0.04530683264601976, + "learning_rate": 3.5311726804023894e-05, + "loss": 0.0012, + "num_tokens": 444494.0, + "reward": 121.85669708251953, + "reward_std": 0.013227737508714199, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.011109627783298492, + "rewards/sentence_structure_reward/std": 0.0030023243743926287, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00241813803371404, + "grad_norm": 0.3650098741054535, + "kl": 0.047674717381596565, + "learning_rate": 3.522891916738269e-05, + "loss": 0.0019, + "num_tokens": 448786.0, + "reward": 166.59751892089844, + "reward_std": 89.5423812866211, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.0058586373925209045, + "rewards/sentence_structure_reward/std": 0.0037579357158392668, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0024381226455629165, + "grad_norm": 0.37107303738594055, + "kl": 0.05135812680236995, + "learning_rate": 3.514548552952211e-05, + "loss": 0.0021, + "num_tokens": 453554.0, + "reward": 151.68955993652344, + "reward_std": 59.68627166748047, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008698396384716034, + "rewards/sentence_structure_reward/std": 0.0020444157999008894, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002458107257411793, + "grad_norm": 0.3555949330329895, + "kl": 0.04799520131200552, + "learning_rate": 3.5061429320072225e-05, + "loss": 0.0019, + "num_tokens": 457654.0, + "reward": 226.30392456054688, + "reward_std": 64.29908752441406, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.0088924216106534, + "rewards/sentence_structure_reward/std": 0.002987799933180213, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0024780918692606695, + "grad_norm": 0.39551085233688354, + "kl": 0.06357492925599217, + "learning_rate": 3.497675399425456e-05, + "loss": 0.0029, + "num_tokens": 461258.0, + "reward": 136.76327514648438, + "reward_std": 29.849214553833008, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.008000846952199936, + "rewards/sentence_structure_reward/std": 0.0018053074600175023, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0024980764811095455, + "grad_norm": 0.42647117376327515, + "kl": 0.051036563934758306, + "learning_rate": 3.489146303274014e-05, + "loss": 0.002, + "num_tokens": 465722.0, + "reward": 136.77685546875, + "reward_std": 29.844858169555664, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.010623043403029442, + "rewards/sentence_structure_reward/std": 0.009008396416902542, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002518061092958422, + "grad_norm": 0.41318488121032715, + "kl": 0.15331217553466558, + "learning_rate": 3.480555994150631e-05, + "loss": 0.0061, + "num_tokens": 469498.0, + "reward": 136.76239013671875, + "reward_std": 29.850582122802734, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007829926908016205, + "rewards/sentence_structure_reward/std": 0.0022687313612550497, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0025380457048072985, + "grad_norm": 0.39885494112968445, + "kl": 0.04930273303762078, + "learning_rate": 3.4719048251692705e-05, + "loss": 0.002, + "num_tokens": 473378.0, + "reward": 151.6854705810547, + "reward_std": 59.691322326660156, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.007910327985882759, + "rewards/sentence_structure_reward/std": 0.0036057187244296074, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0025580303166561746, + "grad_norm": 0.3425118029117584, + "kl": 0.057912896387279034, + "learning_rate": 3.463193151945603e-05, + "loss": 0.002, + "num_tokens": 476622.0, + "reward": 121.84071350097656, + "reward_std": 0.01306266337633133, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.008025456219911575, + "rewards/sentence_structure_reward/std": 0.0026056712958961725, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002578014928505051, + "grad_norm": 0.3540816605091095, + "kl": 0.038455577567219734, + "learning_rate": 3.4544213325823945e-05, + "loss": 0.0016, + "num_tokens": 480358.0, + "reward": 166.5975799560547, + "reward_std": 57.159095764160156, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.005871741101145744, + "rewards/sentence_structure_reward/std": 0.002830666955560446, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0025979995403539275, + "grad_norm": 1.0193675756454468, + "kl": 0.05130916018970311, + "learning_rate": 3.4455897276547836e-05, + "loss": 0.0022, + "num_tokens": 483470.0, + "reward": 121.83052062988281, + "reward_std": 0.01784432679414749, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006056672893464565, + "rewards/sentence_structure_reward/std": 0.004217956680804491, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002617984152202804, + "grad_norm": 0.3701932728290558, + "kl": 0.05585270980373025, + "learning_rate": 3.4366987001954555e-05, + "loss": 0.0023, + "num_tokens": 486750.0, + "reward": 121.83204650878906, + "reward_std": 0.011149849742650986, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.006351416930556297, + "rewards/sentence_structure_reward/std": 0.002616150537505746, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00263796876405168, + "grad_norm": 0.38405516743659973, + "kl": 0.0486052380874753, + "learning_rate": 3.4277486156797264e-05, + "loss": 0.0019, + "num_tokens": 490874.0, + "reward": 181.527099609375, + "reward_std": 0.007388896308839321, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007194197736680508, + "rewards/sentence_structure_reward/std": 0.0016285466263070703, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0026579533759005566, + "grad_norm": 0.2864632308483124, + "kl": 0.03462736704386771, + "learning_rate": 3.418739842010516e-05, + "loss": 0.0016, + "num_tokens": 494242.0, + "reward": 151.69102478027344, + "reward_std": 34.46450424194336, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.008980686776340008, + "rewards/sentence_structure_reward/std": 0.001620158669538796, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002677937987749433, + "grad_norm": 0.384237140417099, + "kl": 0.04695027705747634, + "learning_rate": 3.409672749503224e-05, + "loss": 0.0019, + "num_tokens": 497458.0, + "reward": 151.6789093017578, + "reward_std": 34.472320556640625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006642803084105253, + "rewards/sentence_structure_reward/std": 0.003306509694084525, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002697922599598309, + "grad_norm": 0.2865605056285858, + "kl": 0.030871245253365487, + "learning_rate": 3.40054771087051e-05, + "loss": 0.0011, + "num_tokens": 500854.0, + "reward": 121.84052276611328, + "reward_std": 0.01419970951974392, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.00798724964261055, + "rewards/sentence_structure_reward/std": 0.0036812182515859604, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0027179072114471856, + "grad_norm": 0.36733871698379517, + "kl": 0.031645517563447356, + "learning_rate": 3.391365101206973e-05, + "loss": 0.0013, + "num_tokens": 505042.0, + "reward": 151.3797149658203, + "reward_std": 83.6871337890625, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.007227178663015366, + "rewards/sentence_structure_reward/std": 0.003298749914392829, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002737891823296062, + "grad_norm": 0.3747429847717285, + "kl": 0.02731521538225934, + "learning_rate": 3.38212529797373e-05, + "loss": 0.0011, + "num_tokens": 508694.0, + "reward": 136.4548797607422, + "reward_std": 79.08283996582031, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006811157800257206, + "rewards/sentence_structure_reward/std": 0.0031362453009933233, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0027578764351449386, + "grad_norm": 0.3574497699737549, + "kl": 0.03841200331225991, + "learning_rate": 3.372828680982901e-05, + "loss": 0.0015, + "num_tokens": 512010.0, + "reward": 166.58914184570312, + "reward_std": 64.31922912597656, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.004241541959345341, + "rewards/sentence_structure_reward/std": 0.003570870729163289, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0027778610469938146, + "grad_norm": 0.33894309401512146, + "kl": 0.034148502163589, + "learning_rate": 3.363475632381999e-05, + "loss": 0.0014, + "num_tokens": 515254.0, + "reward": 196.45608520507812, + "reward_std": 29.845003128051758, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.008409542962908745, + "rewards/sentence_structure_reward/std": 0.0011043510166928172, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002797845658842691, + "grad_norm": 0.3846226632595062, + "kl": 0.03369302162900567, + "learning_rate": 3.35406653663822e-05, + "loss": 0.0013, + "num_tokens": 519286.0, + "reward": 136.75193786621094, + "reward_std": 29.851266860961914, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.005813508294522762, + "rewards/sentence_structure_reward/std": 0.002508406527340412, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0028178302706915676, + "grad_norm": 0.655211329460144, + "kl": 0.02145983651280403, + "learning_rate": 3.344601780522634e-05, + "loss": 0.0009, + "num_tokens": 522870.0, + "reward": 136.46621704101562, + "reward_std": 79.06263732910156, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.0089985067024827, + "rewards/sentence_structure_reward/std": 0.003473785240203142, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002837814882540444, + "grad_norm": 0.39196979999542236, + "kl": 0.029069664422422647, + "learning_rate": 3.3350817530942964e-05, + "loss": 0.0012, + "num_tokens": 526542.0, + "reward": 166.600341796875, + "reward_std": 64.3072280883789, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006404052954167128, + "rewards/sentence_structure_reward/std": 0.0010217403760179877, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00285779949438932, + "grad_norm": 0.3450421392917633, + "kl": 0.02962247427785769, + "learning_rate": 3.325506845684246e-05, + "loss": 0.0014, + "num_tokens": 530130.0, + "reward": 106.61236572265625, + "reward_std": 30.45612907409668, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007357136346399784, + "rewards/sentence_structure_reward/std": 0.0016827801009640098, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0028777841062381966, + "grad_norm": 0.2804088294506073, + "kl": 0.022439823718741536, + "learning_rate": 3.315877451879426e-05, + "loss": 0.001, + "num_tokens": 534062.0, + "reward": 136.76754760742188, + "reward_std": 29.85065269470215, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.00882734265178442, + "rewards/sentence_structure_reward/std": 0.0020659053698182106, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002897768718087073, + "grad_norm": 0.3151808977127075, + "kl": 0.0315976400161162, + "learning_rate": 3.3061939675064974e-05, + "loss": 0.0012, + "num_tokens": 537814.0, + "reward": 151.67877197265625, + "reward_std": 34.4713249206543, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006618655286729336, + "rewards/sentence_structure_reward/std": 0.002911181654781103, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002917753329935949, + "grad_norm": 0.39217609167099, + "kl": 0.02523262402974069, + "learning_rate": 3.2964567906155775e-05, + "loss": 0.0018, + "num_tokens": 540742.0, + "reward": 136.74742126464844, + "reward_std": 29.853694915771484, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.004942512605339289, + "rewards/sentence_structure_reward/std": 0.003126244293525815, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0029377379417848256, + "grad_norm": 0.31743162870407104, + "kl": 0.03163698536809534, + "learning_rate": 3.2866663214638685e-05, + "loss": 0.0013, + "num_tokens": 544038.0, + "reward": 166.30084228515625, + "reward_std": 107.3675765991211, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 22.903135299682617, + "rewards/sentence_structure_reward/mean": 0.006928788498044014, + "rewards/sentence_structure_reward/std": 0.0021290613804012537, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.002957722553633702, + "grad_norm": 0.34987640380859375, + "kl": 0.0206563692772761, + "learning_rate": 3.276822962499211e-05, + "loss": 0.0008, + "num_tokens": 547478.0, + "reward": 106.00260925292969, + "reward_std": 107.00798034667969, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.00632974598556757, + "rewards/sentence_structure_reward/std": 0.0027430481277406216, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0029777071654825786, + "grad_norm": 0.35329344868659973, + "kl": 0.017734386841766536, + "learning_rate": 3.26692711834354e-05, + "loss": 0.0007, + "num_tokens": 550926.0, + "reward": 106.30282592773438, + "reward_std": 79.68246459960938, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.005944008473306894, + "rewards/sentence_structure_reward/std": 0.0027581381145864725, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0029976917773314547, + "grad_norm": 0.30811476707458496, + "kl": 0.021204006567131728, + "learning_rate": 3.256979195776247e-05, + "loss": 0.0008, + "num_tokens": 554370.0, + "reward": 121.53272247314453, + "reward_std": 60.29393005371094, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.0069087776355445385, + "rewards/sentence_structure_reward/std": 0.0035726975183933973, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 253.25, + "completions/mean_terminated_length": 234.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.003017676389180331, + "grad_norm": 0.3305506110191345, + "kl": 0.019509048433974385, + "learning_rate": 3.246979603717467e-05, + "loss": 0.0008, + "num_tokens": 557848.0, + "reward": 91.08367919921875, + "reward_std": 60.29705047607422, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 18.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.007052146829664707, + "rewards/sentence_structure_reward/std": 0.0010830545797944069, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0030376610010292076, + "grad_norm": 0.40478426218032837, + "kl": 0.01801724242977798, + "learning_rate": 3.236928753211263e-05, + "loss": 0.0007, + "num_tokens": 561972.0, + "reward": 121.53172302246094, + "reward_std": 60.29460144042969, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.006717929150909185, + "rewards/sentence_structure_reward/std": 0.0013856986770406365, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0030576456128780837, + "grad_norm": 0.3242015838623047, + "kl": 0.0155667137587443, + "learning_rate": 3.2268270574087336e-05, + "loss": 0.0006, + "num_tokens": 565160.0, + "reward": 76.15022277832031, + "reward_std": 65.6033935546875, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 15.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.0049728574231266975, + "rewards/sentence_structure_reward/std": 0.0031709226313978434, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00307763022472696, + "grad_norm": 0.3642282485961914, + "kl": 0.023375058663077652, + "learning_rate": 3.2166749315510265e-05, + "loss": 0.0009, + "num_tokens": 568464.0, + "reward": 106.31480407714844, + "reward_std": 79.6802978515625, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.008254818618297577, + "rewards/sentence_structure_reward/std": 0.0021004669833928347, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0030976148365758367, + "grad_norm": 0.32533693313598633, + "kl": 0.02561825350858271, + "learning_rate": 3.206472792952273e-05, + "loss": 0.0009, + "num_tokens": 572364.0, + "reward": 121.53921508789062, + "reward_std": 49.236595153808594, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.008163060992956161, + "rewards/sentence_structure_reward/std": 0.001503294799476862, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003117599448424713, + "grad_norm": 0.314354807138443, + "kl": 0.02495297184213996, + "learning_rate": 3.196221060982432e-05, + "loss": 0.001, + "num_tokens": 576132.0, + "reward": 196.43869018554688, + "reward_std": 64.31953430175781, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.005052807740867138, + "rewards/sentence_structure_reward/std": 0.0031828756909817457, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003137584060273589, + "grad_norm": 0.4229791462421417, + "kl": 0.026373344706371427, + "learning_rate": 3.185920157050052e-05, + "loss": 0.0011, + "num_tokens": 580644.0, + "reward": 121.53523254394531, + "reward_std": 60.28870391845703, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007394583895802498, + "rewards/sentence_structure_reward/std": 0.0034175189211964607, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 248.5, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.0031575686721224657, + "grad_norm": 0.38771048188209534, + "kl": 0.023568642791360617, + "learning_rate": 3.1755705045849465e-05, + "loss": 0.0009, + "num_tokens": 584356.0, + "reward": 136.1562957763672, + "reward_std": 92.24603271484375, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.007511806208640337, + "rewards/sentence_structure_reward/std": 0.0016839586896821856, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003177553283971342, + "grad_norm": 0.473152756690979, + "kl": 0.03606040868908167, + "learning_rate": 3.1651725290207923e-05, + "loss": 0.0014, + "num_tokens": 587776.0, + "reward": 121.23066711425781, + "reward_std": 87.62784576416016, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.006940416991710663, + "rewards/sentence_structure_reward/std": 0.000662890262901783, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0031975378958202186, + "grad_norm": 0.7930582165718079, + "kl": 0.027441245852969587, + "learning_rate": 3.1547266577776395e-05, + "loss": 0.0018, + "num_tokens": 591680.0, + "reward": 121.85542297363281, + "reward_std": 0.016292022541165352, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 0.0, + "rewards/sentence_structure_reward/mean": 0.010862302035093307, + "rewards/sentence_structure_reward/std": 0.00580249261111021, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0032175225076690947, + "grad_norm": 0.3321499824523926, + "kl": 0.04163651866838336, + "learning_rate": 3.1442333202443394e-05, + "loss": 0.0017, + "num_tokens": 595156.0, + "reward": 136.4522247314453, + "reward_std": 79.07417297363281, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006296558305621147, + "rewards/sentence_structure_reward/std": 0.0027378115337342024, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003237507119517971, + "grad_norm": 0.3406222462654114, + "kl": 0.02945030457340181, + "learning_rate": 3.1336929477609e-05, + "loss": 0.0012, + "num_tokens": 598972.0, + "reward": 136.46539306640625, + "reward_std": 79.0766830444336, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.0088377445936203, + "rewards/sentence_structure_reward/std": 0.0013410788960754871, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0032574917313668477, + "grad_norm": 0.4192326068878174, + "kl": 0.02450850000604987, + "learning_rate": 3.12310597360075e-05, + "loss": 0.001, + "num_tokens": 601916.0, + "reward": 136.7701873779297, + "reward_std": 29.857532501220703, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.009335490874946117, + "rewards/sentence_structure_reward/std": 0.004011332057416439, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0032774763432157237, + "grad_norm": 0.3153103291988373, + "kl": 0.028544869273900986, + "learning_rate": 3.11247283295293e-05, + "loss": 0.0011, + "num_tokens": 605540.0, + "reward": 196.45556640625, + "reward_std": 78.58583068847656, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.008309446275234222, + "rewards/sentence_structure_reward/std": 0.002167526865378022, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0032974609550646, + "grad_norm": 0.3131996989250183, + "kl": 0.025787735707126558, + "learning_rate": 3.101793962904205e-05, + "loss": 0.001, + "num_tokens": 609408.0, + "reward": 136.75836181640625, + "reward_std": 29.858123779296875, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007055344991385937, + "rewards/sentence_structure_reward/std": 0.004500496666878462, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0033174455669134767, + "grad_norm": 0.44056662917137146, + "kl": 0.03679174673743546, + "learning_rate": 3.0910698024210976e-05, + "loss": 0.0015, + "num_tokens": 612816.0, + "reward": 151.6786346435547, + "reward_std": 59.6875114440918, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006590413860976696, + "rewards/sentence_structure_reward/std": 0.0011712894774973392, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003337430178762353, + "grad_norm": 0.3410772383213043, + "kl": 0.025147033855319023, + "learning_rate": 3.0803007923318404e-05, + "loss": 0.001, + "num_tokens": 617508.0, + "reward": 166.31805419921875, + "reward_std": 109.96865844726562, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 22.903135299682617, + "rewards/sentence_structure_reward/mean": 0.010251899249851704, + "rewards/sentence_structure_reward/std": 0.0033587468788027763, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0033574147906112292, + "grad_norm": 0.3396851420402527, + "kl": 0.044431361136958, + "learning_rate": 3.0694873753082597e-05, + "loss": 0.0018, + "num_tokens": 620868.0, + "reward": 166.59840393066406, + "reward_std": 64.31353759765625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006029421463608742, + "rewards/sentence_structure_reward/std": 0.002521348651498556, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0033773994024601057, + "grad_norm": 0.40759602189064026, + "kl": 0.05500357458367944, + "learning_rate": 3.058629995847575e-05, + "loss": 0.0022, + "num_tokens": 624140.0, + "reward": 151.68435668945312, + "reward_std": 59.69160842895508, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.007696100976318121, + "rewards/sentence_structure_reward/std": 0.0017742906929925084, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003397384014308982, + "grad_norm": 0.5917409062385559, + "kl": 0.041553596034646034, + "learning_rate": 3.0477291002541308e-05, + "loss": 0.0017, + "num_tokens": 629092.0, + "reward": 166.30462646484375, + "reward_std": 90.14631652832031, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.007660072296857834, + "rewards/sentence_structure_reward/std": 0.003579641692340374, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0034173686261578583, + "grad_norm": 0.42491018772125244, + "kl": 0.06714614201337099, + "learning_rate": 3.0367851366210507e-05, + "loss": 0.0029, + "num_tokens": 632504.0, + "reward": 151.6785430908203, + "reward_std": 59.694610595703125, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.00657021626830101, + "rewards/sentence_structure_reward/std": 0.003318567993119359, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0034373532380067347, + "grad_norm": 0.432492733001709, + "kl": 0.047549669281579554, + "learning_rate": 3.0257985548118127e-05, + "loss": 0.0019, + "num_tokens": 636496.0, + "reward": 136.45310974121094, + "reward_std": 64.90666198730469, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006469730753451586, + "rewards/sentence_structure_reward/std": 0.0031976946629583836, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0034573378498556112, + "grad_norm": 0.3530580699443817, + "kl": 0.0458946512080729, + "learning_rate": 3.0147698064417646e-05, + "loss": 0.0018, + "num_tokens": 639704.0, + "reward": 151.38204956054688, + "reward_std": 60.2933464050293, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.007678591646254063, + "rewards/sentence_structure_reward/std": 0.001874069101177156, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0034773224617044877, + "grad_norm": 0.3168887495994568, + "kl": 0.04384193941950798, + "learning_rate": 3.0036993448595555e-05, + "loss": 0.0018, + "num_tokens": 643616.0, + "reward": 151.38218688964844, + "reward_std": 60.303565979003906, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.007704591378569603, + "rewards/sentence_structure_reward/std": 0.003953540697693825, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0034973070735533638, + "grad_norm": 2.052316188812256, + "kl": 0.04324845899827778, + "learning_rate": 2.9925876251285008e-05, + "loss": 0.0011, + "num_tokens": 647024.0, + "reward": 136.4581298828125, + "reward_std": 57.681026458740234, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 28.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.007439134642481804, + "rewards/sentence_structure_reward/std": 0.0014148898189887404, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0035172916854022403, + "grad_norm": 0.3174251914024353, + "kl": 0.04775287047959864, + "learning_rate": 2.9814351040078763e-05, + "loss": 0.0019, + "num_tokens": 650568.0, + "reward": 151.3742218017578, + "reward_std": 105.35675048828125, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 22.160131454467773, + "rewards/sentence_structure_reward/mean": 0.006165824364870787, + "rewards/sentence_structure_reward/std": 0.0026151530910283327, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0035372762972511167, + "grad_norm": 0.39383986592292786, + "kl": 0.054818503092974424, + "learning_rate": 2.970242239934144e-05, + "loss": 0.0022, + "num_tokens": 654156.0, + "reward": 211.37579345703125, + "reward_std": 59.68771743774414, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.007838046178221703, + "rewards/sentence_structure_reward/std": 0.00240572402253747, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003557260909099993, + "grad_norm": 0.4380655288696289, + "kl": 0.04732145392335951, + "learning_rate": 2.959009493002108e-05, + "loss": 0.0019, + "num_tokens": 657712.0, + "reward": 121.543701171875, + "reward_std": 60.30317306518555, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 25.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.00902829971164465, + "rewards/sentence_structure_reward/std": 0.0035202307626605034, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0035772455209488693, + "grad_norm": 0.38974782824516296, + "kl": 0.05213979515247047, + "learning_rate": 2.9477373249459974e-05, + "loss": 0.0021, + "num_tokens": 661896.0, + "reward": 196.44557189941406, + "reward_std": 78.58309173583984, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006382114719599485, + "rewards/sentence_structure_reward/std": 0.0027868549805134535, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0035972301327977458, + "grad_norm": 0.38800105452537537, + "kl": 0.058865012135356665, + "learning_rate": 2.936426199120492e-05, + "loss": 0.0024, + "num_tokens": 665084.0, + "reward": 166.602294921875, + "reward_std": 64.30633544921875, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006779382470995188, + "rewards/sentence_structure_reward/std": 0.0007606074213981628, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0036172147446466222, + "grad_norm": 0.39465782046318054, + "kl": 0.06755426968447864, + "learning_rate": 2.9250765804816712e-05, + "loss": 0.0027, + "num_tokens": 668660.0, + "reward": 166.60238647460938, + "reward_std": 64.30355834960938, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.00679972767829895, + "rewards/sentence_structure_reward/std": 0.0010966010158881545, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0036371993564954983, + "grad_norm": 0.47897449135780334, + "kl": 0.054818300530314445, + "learning_rate": 2.9136889355679033e-05, + "loss": 0.0022, + "num_tokens": 673596.0, + "reward": 181.23416137695312, + "reward_std": 111.97261047363281, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.008982768282294273, + "rewards/sentence_structure_reward/std": 0.004853582009673119, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003657183968344375, + "grad_norm": 0.5901926755905151, + "kl": 0.0712578953243792, + "learning_rate": 2.9022637324806693e-05, + "loss": 0.0029, + "num_tokens": 677272.0, + "reward": 181.52450561523438, + "reward_std": 59.68503952026367, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.006691797636449337, + "rewards/sentence_structure_reward/std": 0.00339295482262969, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0036771685801932513, + "grad_norm": 0.424969881772995, + "kl": 0.0897442139685154, + "learning_rate": 2.8908014408653183e-05, + "loss": 0.0036, + "num_tokens": 680644.0, + "reward": 166.29986572265625, + "reward_std": 92.13768768310547, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.00674265343695879, + "rewards/sentence_structure_reward/std": 0.0008984439773485065, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0036971531920421278, + "grad_norm": 0.37172701954841614, + "kl": 0.07420068560168147, + "learning_rate": 2.8793025318917648e-05, + "loss": 0.003, + "num_tokens": 684096.0, + "reward": 196.4502410888672, + "reward_std": 64.30850219726562, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.007282731588929892, + "rewards/sentence_structure_reward/std": 0.001394122838973999, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003717137803891004, + "grad_norm": 0.3605819344520569, + "kl": 0.07117353240028024, + "learning_rate": 2.8677674782351164e-05, + "loss": 0.0024, + "num_tokens": 688264.0, + "reward": 226.29774475097656, + "reward_std": 29.851425170898438, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.007700989488512278, + "rewards/sentence_structure_reward/std": 0.001339192851446569, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0037371224157398803, + "grad_norm": 0.8895725011825562, + "kl": 0.07748813484795392, + "learning_rate": 2.8561967540562517e-05, + "loss": 0.0031, + "num_tokens": 692184.0, + "reward": 241.2211151123047, + "reward_std": 59.68316650390625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007833847776055336, + "rewards/sentence_structure_reward/std": 0.0026440576184540987, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0037571070275887568, + "grad_norm": 0.4801936149597168, + "kl": 0.08478311309590936, + "learning_rate": 2.84459083498232e-05, + "loss": 0.0034, + "num_tokens": 695556.0, + "reward": 241.21759033203125, + "reward_std": 59.69258117675781, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007153892889618874, + "rewards/sentence_structure_reward/std": 0.0008143707527779043, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003777091639437633, + "grad_norm": 0.38287150859832764, + "kl": 0.0892713381908834, + "learning_rate": 2.8329501980871983e-05, + "loss": 0.0033, + "num_tokens": 699264.0, + "reward": 196.1431884765625, + "reward_std": 57.67967224121094, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006348989903926849, + "rewards/sentence_structure_reward/std": 0.001027881633490324, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0037970762512865093, + "grad_norm": 0.40273773670196533, + "kl": 0.0767029399285093, + "learning_rate": 2.8212753218718764e-05, + "loss": 0.0031, + "num_tokens": 703344.0, + "reward": 196.448486328125, + "reward_std": 29.85240936279297, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006942545995116234, + "rewards/sentence_structure_reward/std": 0.0019067238317802548, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003817060863135386, + "grad_norm": 0.48853445053100586, + "kl": 0.09601349080912769, + "learning_rate": 2.8095666862447876e-05, + "loss": 0.0038, + "num_tokens": 707064.0, + "reward": 196.44796752929688, + "reward_std": 64.30943298339844, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006843275390565395, + "rewards/sentence_structure_reward/std": 0.0011484987335279584, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0038370454749842623, + "grad_norm": 0.40011832118034363, + "kl": 0.06750158104114234, + "learning_rate": 2.7978247725020837e-05, + "loss": 0.0024, + "num_tokens": 711496.0, + "reward": 211.36935424804688, + "reward_std": 34.46641159057617, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 22.160131454467773, + "rewards/sentence_structure_reward/mean": 0.0065943095833063126, + "rewards/sentence_structure_reward/std": 0.0010970140574499965, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0038570300868331383, + "grad_norm": 0.49026256799697876, + "kl": 0.0859145037829876, + "learning_rate": 2.7860500633078475e-05, + "loss": 0.0034, + "num_tokens": 715008.0, + "reward": 196.44631958007812, + "reward_std": 64.30458068847656, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006527463905513287, + "rewards/sentence_structure_reward/std": 0.0014449454611167312, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003877014698682015, + "grad_norm": 0.4523202180862427, + "kl": 0.06892016681376845, + "learning_rate": 2.7742430426742567e-05, + "loss": 0.0024, + "num_tokens": 718612.0, + "reward": 196.14312744140625, + "reward_std": 57.68752670288086, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006336957216262817, + "rewards/sentence_structure_reward/std": 0.0027919181156903505, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0038969993105308913, + "grad_norm": 0.4121406674385071, + "kl": 0.05203078413615003, + "learning_rate": 2.7624041959416835e-05, + "loss": 0.0021, + "num_tokens": 723064.0, + "reward": 106.0084457397461, + "reward_std": 64.91384887695312, + "rewards/keyword_presence_reward/mean": 0.625, + "rewards/keyword_presence_reward/std": 0.5175492167472839, + "rewards/reward_keyword_presence/mean": 21.875, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.007457427214831114, + "rewards/sentence_structure_reward/std": 0.0017254067352041602, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 231.5, + "completions/mean_terminated_length": 60.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.003916983922379768, + "grad_norm": 0.8556960821151733, + "kl": 0.1010344447568059, + "learning_rate": 2.7505340097587488e-05, + "loss": 0.004, + "num_tokens": 726488.0, + "reward": 181.2310028076172, + "reward_std": 105.34259796142578, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.008374437689781189, + "rewards/sentence_structure_reward/std": 0.00914711132645607, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003936968534228644, + "grad_norm": 0.4162931740283966, + "kl": 0.10135616781190038, + "learning_rate": 2.738632972062313e-05, + "loss": 0.0041, + "num_tokens": 730684.0, + "reward": 226.2975616455078, + "reward_std": 105.8871078491211, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.007664205506443977, + "rewards/sentence_structure_reward/std": 0.0012312040198594332, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00395695314607752, + "grad_norm": 0.4268065094947815, + "kl": 0.06930369546171278, + "learning_rate": 2.726701572057423e-05, + "loss": 0.0028, + "num_tokens": 735680.0, + "reward": 241.2240753173828, + "reward_std": 114.30000305175781, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.008406420238316059, + "rewards/sentence_structure_reward/std": 0.00213907053694129, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003976937757926397, + "grad_norm": 0.5458014607429504, + "kl": 0.09905706904828548, + "learning_rate": 2.7147403001972023e-05, + "loss": 0.004, + "num_tokens": 739748.0, + "reward": 211.36724853515625, + "reward_std": 59.69007110595703, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.0061907898634672165, + "rewards/sentence_structure_reward/std": 0.0006799095426686108, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.003996922369775273, + "grad_norm": 0.41277816891670227, + "kl": 0.07710113539360464, + "learning_rate": 2.7027496481626858e-05, + "loss": 0.0031, + "num_tokens": 743776.0, + "reward": 196.45208740234375, + "reward_std": 29.853837966918945, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.0076400539837777615, + "rewards/sentence_structure_reward/std": 0.0033419884275645018, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00401690698162415, + "grad_norm": 0.45092982053756714, + "kl": 0.08694125758484006, + "learning_rate": 2.6907301088426155e-05, + "loss": 0.0035, + "num_tokens": 747588.0, + "reward": 211.3711395263672, + "reward_std": 34.45982360839844, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006939921993762255, + "rewards/sentence_structure_reward/std": 0.0020985316950827837, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004036891593473026, + "grad_norm": 0.519778847694397, + "kl": 0.12678114511072636, + "learning_rate": 2.6786821763131755e-05, + "loss": 0.0047, + "num_tokens": 750696.0, + "reward": 226.2904052734375, + "reward_std": 57.15570068359375, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006284583825618029, + "rewards/sentence_structure_reward/std": 0.001313726999796927, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004056876205321902, + "grad_norm": 0.4653032124042511, + "kl": 0.07340941019356251, + "learning_rate": 2.666606345817684e-05, + "loss": 0.0029, + "num_tokens": 754432.0, + "reward": 226.29266357421875, + "reward_std": 91.61013793945312, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.006718168966472149, + "rewards/sentence_structure_reward/std": 0.0015461437869817019, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004076860817170779, + "grad_norm": 0.5643277764320374, + "kl": 0.10020781308412552, + "learning_rate": 2.654503113746234e-05, + "loss": 0.004, + "num_tokens": 758536.0, + "reward": 166.60833740234375, + "reward_std": 64.30059814453125, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.007949287071824074, + "rewards/sentence_structure_reward/std": 0.0020666285417973995, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004096845429019655, + "grad_norm": 0.4655681252479553, + "kl": 0.07193906605243683, + "learning_rate": 2.6423729776152917e-05, + "loss": 0.0029, + "num_tokens": 762608.0, + "reward": 151.68113708496094, + "reward_std": 59.689208984375, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.0070740398950874805, + "rewards/sentence_structure_reward/std": 0.0007524870452471077, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004116830040868531, + "grad_norm": 0.48426052927970886, + "kl": 0.07032767264172435, + "learning_rate": 2.630216436047242e-05, + "loss": 0.0028, + "num_tokens": 766176.0, + "reward": 241.21217346191406, + "reward_std": 87.00099182128906, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.006106480956077576, + "rewards/sentence_structure_reward/std": 0.0029420522041618824, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004136814652717408, + "grad_norm": 0.5413448214530945, + "kl": 0.09072657534852624, + "learning_rate": 2.618033988749895e-05, + "loss": 0.0036, + "num_tokens": 770112.0, + "reward": 241.21713256835938, + "reward_std": 94.14925384521484, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.007065536454319954, + "rewards/sentence_structure_reward/std": 0.002313598059117794, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004156799264566284, + "grad_norm": 0.500718355178833, + "kl": 0.06371236176346429, + "learning_rate": 2.6058261364959444e-05, + "loss": 0.0025, + "num_tokens": 774924.0, + "reward": 196.16754150390625, + "reward_std": 107.35367584228516, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 22.903135299682617, + "rewards/sentence_structure_reward/mean": 0.011048653163015842, + "rewards/sentence_structure_reward/std": 0.007717860396951437, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00417678387641516, + "grad_norm": 0.46990102529525757, + "kl": 0.0833042012527585, + "learning_rate": 2.5935933811023812e-05, + "loss": 0.0036, + "num_tokens": 778392.0, + "reward": 211.07113647460938, + "reward_std": 77.52779388427734, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 22.160131454467773, + "rewards/sentence_structure_reward/mean": 0.007365101017057896, + "rewards/sentence_structure_reward/std": 0.002440233016386628, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004196768488264037, + "grad_norm": 0.44950953125953674, + "kl": 0.0844889055006206, + "learning_rate": 2.5813362254098678e-05, + "loss": 0.0034, + "num_tokens": 782284.0, + "reward": 181.2283477783203, + "reward_std": 87.51434326171875, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.007862258702516556, + "rewards/sentence_structure_reward/std": 0.002410661894828081, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004216753100112913, + "grad_norm": 0.5048360824584961, + "kl": 0.09365172614343464, + "learning_rate": 2.569055173262065e-05, + "loss": 0.0037, + "num_tokens": 786532.0, + "reward": 196.44949340820312, + "reward_std": 64.30047607421875, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.007137882523238659, + "rewards/sentence_structure_reward/std": 0.0018373564817011356, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.00423673771196179, + "grad_norm": 0.4884333312511444, + "kl": 0.06758214044384658, + "learning_rate": 2.556750729484927e-05, + "loss": 0.0024, + "num_tokens": 790212.0, + "reward": 256.14569091796875, + "reward_std": 29.851896286010742, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 53.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.00820031389594078, + "rewards/sentence_structure_reward/std": 0.002503173192963004, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004256722323810666, + "grad_norm": 0.6413646340370178, + "kl": 0.09433773951604962, + "learning_rate": 2.5444233998659424e-05, + "loss": 0.0053, + "num_tokens": 794608.0, + "reward": 196.4485626220703, + "reward_std": 29.84714698791504, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006959002465009689, + "rewards/sentence_structure_reward/std": 0.00078383315121755, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004276706935659542, + "grad_norm": 0.4600231647491455, + "kl": 0.0742596909403801, + "learning_rate": 2.5320736911333503e-05, + "loss": 0.003, + "num_tokens": 798324.0, + "reward": 181.52706909179688, + "reward_std": 59.68961715698242, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.007189121562987566, + "rewards/sentence_structure_reward/std": 0.0012240216601639986, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004296691547508419, + "grad_norm": 1.3496555089950562, + "kl": 0.08517341781407595, + "learning_rate": 2.519702110935306e-05, + "loss": 0.004, + "num_tokens": 801752.0, + "reward": 256.13848876953125, + "reward_std": 29.849397659301758, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 53.125, + "rewards/reward_keyword_presence/std": 8.838834762573242, + "rewards/sentence_structure_reward/mean": 0.0068114642053842545, + "rewards/sentence_structure_reward/std": 0.0011512238997966051, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004316676159357295, + "grad_norm": 0.516688346862793, + "kl": 0.07948511326685548, + "learning_rate": 2.5073091678190147e-05, + "loss": 0.0032, + "num_tokens": 805756.0, + "reward": 166.2950439453125, + "reward_std": 79.07241821289062, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 34.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.005810824688524008, + "rewards/sentence_structure_reward/std": 0.002978959586471319, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004336660771206171, + "grad_norm": 0.7424916625022888, + "kl": 0.08231276413425803, + "learning_rate": 2.494895371209829e-05, + "loss": 0.0033, + "num_tokens": 809388.0, + "reward": 181.52383422851562, + "reward_std": 68.92282104492188, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 13.363062858581543, + "rewards/sentence_structure_reward/mean": 0.006560345180332661, + "rewards/sentence_structure_reward/std": 0.0015788032906129956, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004356645383055048, + "grad_norm": 0.4586159586906433, + "kl": 0.08605947252362967, + "learning_rate": 2.482461231390305e-05, + "loss": 0.0019, + "num_tokens": 812928.0, + "reward": 211.37001037597656, + "reward_std": 34.4599609375, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006722572259604931, + "rewards/sentence_structure_reward/std": 0.0019025133224204183, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004376629994903924, + "grad_norm": 0.5218976736068726, + "kl": 0.08287978125736117, + "learning_rate": 2.4700072594792307e-05, + "loss": 0.0033, + "num_tokens": 816904.0, + "reward": 241.2115020751953, + "reward_std": 97.47427368164062, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.005977953784167767, + "rewards/sentence_structure_reward/std": 0.0025798638816922903, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0043966146067528, + "grad_norm": 0.5248226523399353, + "kl": 0.08689117338508368, + "learning_rate": 2.457533967410611e-05, + "loss": 0.0035, + "num_tokens": 820692.0, + "reward": 271.0598449707031, + "reward_std": 118.87417602539062, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 56.25, + "rewards/reward_keyword_presence/std": 25.877458572387695, + "rewards/sentence_structure_reward/mean": 0.0065536354668438435, + "rewards/sentence_structure_reward/std": 0.0014408943243324757, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004416599218601677, + "grad_norm": 0.6284591555595398, + "kl": 0.08012733934447169, + "learning_rate": 2.445041867912629e-05, + "loss": 0.0032, + "num_tokens": 824684.0, + "reward": 196.44744873046875, + "reward_std": 64.30610656738281, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.00674235075712204, + "rewards/sentence_structure_reward/std": 0.0013805078342556953, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004436583830450553, + "grad_norm": 0.4986615777015686, + "kl": 0.07012146804481745, + "learning_rate": 2.432531474486567e-05, + "loss": 0.0028, + "num_tokens": 829568.0, + "reward": 151.07696533203125, + "reward_std": 126.76100158691406, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 31.25, + "rewards/reward_keyword_presence/std": 25.877458572387695, + "rewards/sentence_structure_reward/mean": 0.007125264033675194, + "rewards/sentence_structure_reward/std": 0.004637881647795439, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004456568442299429, + "grad_norm": 0.5977458357810974, + "kl": 0.08469080179929733, + "learning_rate": 2.4200033013856987e-05, + "loss": 0.0034, + "num_tokens": 833632.0, + "reward": 241.21238708496094, + "reward_std": 86.9915771484375, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.0061480277217924595, + "rewards/sentence_structure_reward/std": 0.001013968139886856, + "step": 223 }, { "clip_ratio/high_max": 0.0, @@ -660,21 +6489,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0004421971430219368, - "grad_norm": 0.47010213136672974, - "kl": 0.009743526752572507, - "learning_rate": 3.1755705045849465e-05, - "loss": 0.0004, - "num_tokens": 83931.0, - "reward": 76.16775512695312, - "reward_std": 65.6152114868164, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.008356106467545033, - "rewards/sentence_structure_reward/std": 0.005750824231654406, - "step": 23 + "epoch": 0.004476553054148306, + "grad_norm": 0.509591817855835, + "kl": 0.08169552672188729, + "learning_rate": 2.4074578635941513e-05, + "loss": 0.0033, + "num_tokens": 837084.0, + "reward": 181.22654724121094, + "reward_std": 111.99308776855469, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.0075145079754292965, + "rewards/sentence_structure_reward/std": 0.0023957956582307816, + "step": 224 }, { "clip_ratio/high_max": 0.0, @@ -689,21 +6518,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.000461423105762021, - "grad_norm": 0.4251878261566162, - "kl": 0.010788556712213904, - "learning_rate": 3.0449971294318977e-05, - "loss": 0.0004, - "num_tokens": 87707.0, - "reward": 45.720977783203125, - "reward_std": 65.60011291503906, - "rewards/keyword_presence_reward/mean": 0.375, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.008936258032917976, - "rewards/sentence_structure_reward/std": 0.0032091650646179914, - "step": 24 + "epoch": 0.004496537665997182, + "grad_norm": 0.4702901244163513, + "kl": 0.08745963964611292, + "learning_rate": 2.3948956768057344e-05, + "loss": 0.0035, + "num_tokens": 840980.0, + "reward": 211.3868408203125, + "reward_std": 86.97686004638672, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.009972295723855495, + "rewards/sentence_structure_reward/std": 0.009471634402871132, + "step": 225 }, { "clip_ratio/high_max": 0.0, @@ -718,21 +6547,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.00048064906850210523, - "grad_norm": 0.41720154881477356, - "kl": 0.013568728463724256, - "learning_rate": 2.9079809994790937e-05, - "loss": 0.0005, - "num_tokens": 90707.0, - "reward": 76.15437316894531, - "reward_std": 65.6170654296875, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.005773636512458324, - "rewards/sentence_structure_reward/std": 0.002572158118709922, - "step": 25 + "epoch": 0.004516522277846059, + "grad_norm": 0.4319726526737213, + "kl": 0.10563488863408566, + "learning_rate": 2.382317257402745e-05, + "loss": 0.0042, + "num_tokens": 844756.0, + "reward": 271.060791015625, + "reward_std": 59.689903259277344, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 56.25, + "rewards/reward_keyword_presence/std": 11.57275104522705, + "rewards/sentence_structure_reward/mean": 0.006735933944582939, + "rewards/sentence_structure_reward/std": 0.0010385764762759209, + "step": 226 }, { "clip_ratio/high_max": 0.0, @@ -747,21 +6576,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0004998750312421895, - "grad_norm": 0.40808844566345215, - "kl": 0.006949493603315204, - "learning_rate": 2.7653668647301797e-05, - "loss": 0.0003, - "num_tokens": 94331.0, - "reward": 45.403167724609375, - "reward_std": 90.74295806884766, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.005927084479480982, - "rewards/sentence_structure_reward/std": 0.0026827706024050713, - "step": 26 + "epoch": 0.004536506889694935, + "grad_norm": 0.48102718591690063, + "kl": 0.0932152452878654, + "learning_rate": 2.3697231224347378e-05, + "loss": 0.0037, + "num_tokens": 848312.0, + "reward": 226.29766845703125, + "reward_std": 64.30912780761719, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.007683071307837963, + "rewards/sentence_structure_reward/std": 0.0020469529554247856, + "step": 227 }, { "clip_ratio/high_max": 0.0, @@ -776,21 +6605,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0005191009939822736, - "grad_norm": 0.5842301249504089, - "kl": 0.011005887645296752, - "learning_rate": 2.618033988749895e-05, - "loss": 0.0004, - "num_tokens": 97963.0, - "reward": 30.492992401123047, - "reward_std": 60.892799377441406, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 6.25, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.008338943123817444, - "rewards/sentence_structure_reward/std": 0.003053515451028943, - "step": 27 + "epoch": 0.004556491501543811, + "grad_norm": 0.5381566882133484, + "kl": 0.11695172544568777, + "learning_rate": 2.3571137895972735e-05, + "loss": 0.0047, + "num_tokens": 852624.0, + "reward": 181.21868896484375, + "reward_std": 87.52096557617188, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.005998874083161354, + "rewards/sentence_structure_reward/std": 0.0002707337844185531, + "step": 228 }, { "clip_ratio/high_max": 0.0, @@ -805,21 +6634,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0005383269567223579, - "grad_norm": 0.3839789628982544, - "kl": 0.012193127418868244, - "learning_rate": 2.4668907277118114e-05, - "loss": 0.0005, - "num_tokens": 101939.0, - "reward": 106.00430297851562, - "reward_std": 92.83302307128906, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 21.875, - "rewards/reward_keyword_presence/std": 20.863075256347656, - "rewards/sentence_structure_reward/mean": 0.00665743462741375, - "rewards/sentence_structure_reward/std": 0.0012659912463277578, - "step": 28 + "epoch": 0.004576476113392688, + "grad_norm": 0.6514174342155457, + "kl": 0.08573433756828308, + "learning_rate": 2.344489777210638e-05, + "loss": 0.0034, + "num_tokens": 856604.0, + "reward": 211.3708953857422, + "reward_std": 86.996337890625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.006892836652696133, + "rewards/sentence_structure_reward/std": 0.001659597852267325, + "step": 229 }, { "clip_ratio/high_max": 0.0, @@ -827,28 +6656,28 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, + "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, - "completions/max_terminated_length": 176.0, - "completions/mean_length": 246.0, - "completions/mean_terminated_length": 176.0, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, - "epoch": 0.0005575529194624421, - "grad_norm": 0.4328064024448395, - "kl": 0.014451814175117761, - "learning_rate": 2.312868930080462e-05, - "loss": 0.0006, - "num_tokens": 106435.0, - "reward": 120.92703247070312, - "reward_std": 115.45464324951172, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 25.0, - "rewards/reward_keyword_presence/std": 23.1455020904541, - "rewards/sentence_structure_reward/mean": 0.0066681825555861, - "rewards/sentence_structure_reward/std": 0.0019869457464665174, - "step": 29 + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004596460725241564, + "grad_norm": 0.4567367136478424, + "kl": 0.07012901455163956, + "learning_rate": 2.331851604198536e-05, + "loss": 0.0028, + "num_tokens": 860332.0, + "reward": 180.92190551757812, + "reward_std": 120.04864501953125, + "rewards/keyword_presence_reward/mean": 0.75, + "rewards/keyword_presence_reward/std": 0.4629100561141968, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 26.726125717163086, + "rewards/sentence_structure_reward/mean": 0.007045557722449303, + "rewards/sentence_structure_reward/std": 0.0018652935978025198, + "step": 230 }, { "clip_ratio/high_max": 0.0, @@ -863,21 +6692,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0005767788822025263, - "grad_norm": 0.34823140501976013, - "kl": 0.00843483186326921, - "learning_rate": 2.1569181914556904e-05, - "loss": 0.0003, - "num_tokens": 109903.0, - "reward": 76.16949462890625, - "reward_std": 65.60303497314453, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.008690639398992062, - "rewards/sentence_structure_reward/std": 0.0029439690988510847, - "step": 30 + "epoch": 0.00461644533709044, + "grad_norm": 0.42845338582992554, + "kl": 0.07348805747460574, + "learning_rate": 2.3191997900667588e-05, + "loss": 0.0029, + "num_tokens": 863968.0, + "reward": 285.9840087890625, + "reward_std": 91.60697937011719, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 59.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006842734292149544, + "rewards/sentence_structure_reward/std": 0.0019058574689552188, + "step": 231 }, { "clip_ratio/high_max": 0.0, @@ -892,21 +6721,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0005960048449426105, - "grad_norm": 0.3507833182811737, - "kl": 0.008294492959976196, - "learning_rate": 2e-05, - "loss": 0.0003, - "num_tokens": 113755.0, - "reward": 45.74156951904297, - "reward_std": 65.58068084716797, - "rewards/keyword_presence_reward/mean": 0.375, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.012909697368741035, - "rewards/sentence_structure_reward/std": 0.008297405205667019, - "step": 31 + "epoch": 0.004636429948939317, + "grad_norm": 0.4537491798400879, + "kl": 0.05531412921845913, + "learning_rate": 2.3065348548818317e-05, + "loss": 0.0022, + "num_tokens": 867440.0, + "reward": 181.22320556640625, + "reward_std": 87.51561737060547, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.00687025673687458, + "rewards/sentence_structure_reward/std": 0.001459274673834443, + "step": 232 }, { "clip_ratio/high_max": 0.0, @@ -921,21 +6750,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0006152308076826947, - "grad_norm": 0.3283151388168335, - "kl": 0.012263400538358837, - "learning_rate": 1.8430818085443106e-05, - "loss": 0.0005, - "num_tokens": 117367.0, - "reward": 75.8531494140625, - "reward_std": 92.94621276855469, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.005965461954474449, - "rewards/sentence_structure_reward/std": 0.00285523384809494, - "step": 32 + "epoch": 0.004656414560788193, + "grad_norm": 0.5737345218658447, + "kl": 0.1612817863933742, + "learning_rate": 2.2938573192496362e-05, + "loss": 0.0065, + "num_tokens": 871668.0, + "reward": 181.2212677001953, + "reward_std": 87.51573181152344, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 18.898223876953125, + "rewards/sentence_structure_reward/mean": 0.006494954228401184, + "rewards/sentence_structure_reward/std": 0.002161997137591243, + "step": 233 }, { "clip_ratio/high_max": 0.0, @@ -943,28 +6772,28 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, + "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, - "completions/max_terminated_length": 165.0, - "completions/mean_length": 244.625, - "completions/mean_terminated_length": 165.0, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, - "epoch": 0.0006344567704227789, - "grad_norm": 0.5506321787834167, - "kl": 0.028337979631032795, - "learning_rate": 1.687131069919538e-05, - "loss": 0.0011, - "num_tokens": 121084.0, - "reward": 76.16398620605469, - "reward_std": 30.45020866394043, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.007628860417753458, - "rewards/sentence_structure_reward/std": 0.0027101507876068354, - "step": 33 + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004676399172637069, + "grad_norm": 0.6616830229759216, + "kl": 0.10140653047710657, + "learning_rate": 2.281167704294006e-05, + "loss": 0.0041, + "num_tokens": 874896.0, + "reward": 226.289794921875, + "reward_std": 98.77301025390625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 20.863075256347656, + "rewards/sentence_structure_reward/mean": 0.006163044832646847, + "rewards/sentence_structure_reward/std": 0.0013360182056203485, + "step": 234 }, { "clip_ratio/high_max": 0.0, @@ -979,21 +6808,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0006536827331628632, - "grad_norm": 0.41687554121017456, - "kl": 0.008350938325747848, - "learning_rate": 1.53310927228819e-05, - "loss": 0.0003, - "num_tokens": 124884.0, - "reward": 75.85450744628906, - "reward_std": 79.6773910522461, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.006227576173841953, - "rewards/sentence_structure_reward/std": 0.003205387620255351, - "step": 34 + "epoch": 0.004696383784485946, + "grad_norm": 0.7217199206352234, + "kl": 0.11736772675067186, + "learning_rate": 2.2684665316353112e-05, + "loss": 0.0047, + "num_tokens": 879164.0, + "reward": 211.3701934814453, + "reward_std": 59.69645690917969, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 17.677669525146484, + "rewards/sentence_structure_reward/mean": 0.00675593689084053, + "rewards/sentence_structure_reward/std": 0.0034250360913574696, + "step": 235 }, { "clip_ratio/high_max": 0.0, @@ -1008,21 +6837,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0006729086959029473, - "grad_norm": 0.40423351526260376, - "kl": 0.011719608766725287, - "learning_rate": 1.3819660112501054e-05, - "loss": 0.0005, - "num_tokens": 128708.0, - "reward": 91.10885620117188, - "reward_std": 84.36630249023438, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 18.75, - "rewards/reward_keyword_presence/std": 17.677669525146484, - "rewards/sentence_structure_reward/mean": 0.011911422945559025, - "rewards/sentence_structure_reward/std": 0.00928829237818718, - "step": 35 + "epoch": 0.004716368396334822, + "grad_norm": 0.7768247127532959, + "kl": 0.07498306594789028, + "learning_rate": 2.2557543233690122e-05, + "loss": 0.003, + "num_tokens": 882896.0, + "reward": 211.06954956054688, + "reward_std": 87.51898193359375, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 43.75, + "rewards/reward_keyword_presence/std": 22.160131454467773, + "rewards/sentence_structure_reward/mean": 0.007059835828840733, + "rewards/sentence_structure_reward/std": 0.0014004630502313375, + "step": 236 }, { "clip_ratio/high_max": 0.0, @@ -1032,26 +6861,26 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 170.0, - "completions/mean_length": 245.25, - "completions/mean_terminated_length": 170.0, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, - "epoch": 0.0006921346586430316, - "grad_norm": 0.3921840190887451, - "kl": 0.012377139064483345, - "learning_rate": 1.2346331352698206e-05, - "loss": 0.0005, - "num_tokens": 131982.0, - "reward": 91.3807601928711, - "reward_std": 60.89781951904297, - "rewards/keyword_presence_reward/mean": 0.75, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 18.75, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.006061872001737356, - "rewards/sentence_structure_reward/std": 0.0010733804665505886, - "step": 36 + "completions/max_terminated_length": 42.0, + "completions/mean_length": 229.25, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.004736353008183699, + "grad_norm": 1.6196120977401733, + "kl": 0.11006879527121782, + "learning_rate": 2.243031602044201e-05, + "loss": 0.0044, + "num_tokens": 885970.0, + "reward": 255.84967041015625, + "reward_std": 114.8004379272461, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 53.125, + "rewards/reward_keyword_presence/std": 24.775779724121094, + "rewards/sentence_structure_reward/mean": 0.009396668523550034, + "rewards/sentence_structure_reward/std": 0.009449242614209652, + "step": 237 }, { "clip_ratio/high_max": 0.0, @@ -1066,21 +6895,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0007113606213831157, - "grad_norm": 0.4070833921432495, - "kl": 0.013891480572056025, - "learning_rate": 1.0920190005209066e-05, - "loss": 0.0006, - "num_tokens": 135790.0, - "reward": 45.402992248535156, - "reward_std": 57.78147888183594, - "rewards/keyword_presence_reward/mean": 0.25, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.005893459543585777, - "rewards/sentence_structure_reward/std": 0.000792782346252352, - "step": 37 + "epoch": 0.004756337620032575, + "grad_norm": 0.5022542476654053, + "kl": 0.093661118298769, + "learning_rate": 2.2302988906421192e-05, + "loss": 0.0037, + "num_tokens": 889810.0, + "reward": 315.82916259765625, + "reward_std": 64.30420684814453, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 65.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006808387115597725, + "rewards/sentence_structure_reward/std": 0.0015166359953582287, + "step": 238 }, { "clip_ratio/high_max": 0.0, @@ -1095,21 +6924,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0007305865841232, - "grad_norm": 0.3829093873500824, - "kl": 0.013881997758289799, - "learning_rate": 9.550028705681024e-06, - "loss": 0.0006, - "num_tokens": 139550.0, - "reward": 45.71242141723633, - "reward_std": 30.45193862915039, - "rewards/keyword_presence_reward/mean": 0.375, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 9.375, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.0072850072756409645, - "rewards/sentence_structure_reward/std": 0.0008341627544723451, - "step": 38 + "epoch": 0.004776322231881451, + "grad_norm": 0.49466824531555176, + "kl": 0.06707307789474726, + "learning_rate": 2.217556712554662e-05, + "loss": 0.0027, + "num_tokens": 893218.0, + "reward": 196.44818115234375, + "reward_std": 78.58013916015625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 40.625, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.006885595619678497, + "rewards/sentence_structure_reward/std": 0.001622700714506209, + "step": 239 }, { "clip_ratio/high_max": 0.0, @@ -1124,21 +6953,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0007498125468632841, - "grad_norm": 0.49871861934661865, - "kl": 0.016999104409478605, - "learning_rate": 8.24429495415054e-06, - "loss": 0.0007, - "num_tokens": 143686.0, - "reward": 76.16342163085938, - "reward_std": 65.61138916015625, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 12.938729286193848, - "rewards/sentence_structure_reward/mean": 0.007519744802266359, - "rewards/sentence_structure_reward/std": 0.0022481370251625776, - "step": 39 + "epoch": 0.004796306843730328, + "grad_norm": 0.5311232209205627, + "kl": 0.11908868828322738, + "learning_rate": 2.2048055915628626e-05, + "loss": 0.0048, + "num_tokens": 896506.0, + "reward": 225.98851013183594, + "reward_std": 124.40483093261719, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 24.775779724121094, + "rewards/sentence_structure_reward/mean": 0.0063459258526563644, + "rewards/sentence_structure_reward/std": 0.0010990251321345568, + "step": 240 }, { "clip_ratio/high_max": 0.0, @@ -1153,21 +6982,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0007690385096033684, - "grad_norm": 0.5383055806159973, - "kl": 0.016338131797965616, - "learning_rate": 7.01103903339633e-06, - "loss": 0.0007, - "num_tokens": 147514.0, - "reward": 106.30233764648438, - "reward_std": 79.68934631347656, - "rewards/keyword_presence_reward/mean": 0.75, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 21.875, + "epoch": 0.004816291455579204, + "grad_norm": 0.47815972566604614, + "kl": 0.0977805107831955, + "learning_rate": 2.1920460518153637e-05, + "loss": 0.0039, + "num_tokens": 900390.0, + "reward": 226.28955078125, + "reward_std": 78.57991790771484, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 46.875, "rewards/reward_keyword_presence/std": 16.02174949645996, - "rewards/sentence_structure_reward/mean": 0.0058500999584794044, - "rewards/sentence_structure_reward/std": 0.002530187601223588, - "step": 40 + "rewards/sentence_structure_reward/mean": 0.0061183348298072815, + "rewards/sentence_structure_reward/std": 0.0011620632139965892, + "step": 241 }, { "clip_ratio/high_max": 0.0, @@ -1182,21 +7011,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0007882644723434525, - "grad_norm": 1.2685279846191406, - "kl": 0.01337557251099497, - "learning_rate": 5.857864376269051e-06, - "loss": 0.0005, - "num_tokens": 151494.0, - "reward": 91.38697814941406, - "reward_std": 60.903587341308594, - "rewards/keyword_presence_reward/mean": 0.75, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 18.75, - "rewards/reward_keyword_presence/std": 11.57275104522705, - "rewards/sentence_structure_reward/mean": 0.007261113729327917, - "rewards/sentence_structure_reward/std": 0.0022333364468067884, - "step": 41 + "epoch": 0.00483627606742808, + "grad_norm": 0.47932857275009155, + "kl": 0.08740363293327391, + "learning_rate": 2.179278617806867e-05, + "loss": 0.0035, + "num_tokens": 903998.0, + "reward": 225.99781799316406, + "reward_std": 92.124755859375, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 46.875, + "rewards/reward_keyword_presence/std": 24.775779724121094, + "rewards/sentence_structure_reward/mean": 0.00814027152955532, + "rewards/sentence_structure_reward/std": 0.0038764390628784895, + "step": 242 }, { "clip_ratio/high_max": 0.0, @@ -1211,21 +7040,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0008074904350835368, - "grad_norm": 0.4705398678779602, - "kl": 0.018544162332545966, - "learning_rate": 4.791880687999382e-06, - "loss": 0.0007, - "num_tokens": 156378.0, - "reward": 91.08598327636719, - "reward_std": 60.29417419433594, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 18.75, - "rewards/reward_keyword_presence/std": 17.677669525146484, - "rewards/sentence_structure_reward/mean": 0.007497060112655163, - "rewards/sentence_structure_reward/std": 0.0023706729989498854, - "step": 42 + "epoch": 0.004856260679276957, + "grad_norm": 0.4808523654937744, + "kl": 0.11486887093633413, + "learning_rate": 2.166503814356579e-05, + "loss": 0.0046, + "num_tokens": 907546.0, + "reward": 241.21377563476562, + "reward_std": 59.69194793701172, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 50.0, + "rewards/reward_keyword_presence/std": 23.1455020904541, + "rewards/sentence_structure_reward/mean": 0.006416039075702429, + "rewards/sentence_structure_reward/std": 0.0015745005803182721, + "step": 243 }, { "clip_ratio/high_max": 0.0, @@ -1233,57 +7062,28 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, + "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, - "completions/max_terminated_length": 192.0, - "completions/mean_length": 230.0, - "completions/mean_terminated_length": 152.0, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, - "epoch": 0.0008267163978236211, - "grad_norm": 0.5505346059799194, - "kl": 0.021391528309322894, - "learning_rate": 3.819660112501053e-06, - "loss": 0.0009, - "num_tokens": 159510.0, - "reward": 91.08802795410156, - "reward_std": 84.37997436523438, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 18.75, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.004876245291125833, + "grad_norm": 0.48359960317611694, + "kl": 0.08285618154332042, + "learning_rate": 2.1537221665866337e-05, + "loss": 0.0033, + "num_tokens": 910978.0, + "reward": 271.057861328125, + "reward_std": 59.689598083496094, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 56.25, "rewards/reward_keyword_presence/std": 17.677669525146484, - "rewards/sentence_structure_reward/mean": 0.007892703637480736, - "rewards/sentence_structure_reward/std": 0.0026830954011529684, - "step": 43 - }, - { - "clip_ratio/high_max": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, - "completions/max_length": 256.0, - "completions/max_terminated_length": 65.0, - "completions/mean_length": 232.125, - "completions/mean_terminated_length": 65.0, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, - "epoch": 0.0008459423605637052, - "grad_norm": 0.5545933246612549, - "kl": 0.03635726892389357, - "learning_rate": 2.947196712918157e-06, - "loss": 0.0015, - "num_tokens": 163559.0, - "reward": 75.88179016113281, - "reward_std": 79.66300964355469, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.011492840945720673, - "rewards/sentence_structure_reward/std": 0.009815745986998081, - "step": 44 + "rewards/sentence_structure_reward/mean": 0.0061737908981740475, + "rewards/sentence_structure_reward/std": 0.0014534631045535207, + "step": 244 }, { "clip_ratio/high_max": 0.0, @@ -1298,21 +7098,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0008651683233037895, - "grad_norm": 0.3530910313129425, - "kl": 0.01187379052862525, - "learning_rate": 2.1798695162326444e-06, - "loss": 0.0005, - "num_tokens": 167591.0, - "reward": 106.00372314453125, - "reward_std": 64.91084289550781, - "rewards/keyword_presence_reward/mean": 0.625, - "rewards/keyword_presence_reward/std": 0.5175492167472839, - "rewards/reward_keyword_presence/mean": 21.875, - "rewards/reward_keyword_presence/std": 20.863075256347656, - "rewards/sentence_structure_reward/mean": 0.0065462831407785416, - "rewards/sentence_structure_reward/std": 0.0008392409072257578, - "step": 45 + "epoch": 0.004896229902974709, + "grad_norm": 0.5753456354141235, + "kl": 0.10641218489035964, + "learning_rate": 2.140934199900508e-05, + "loss": 0.0043, + "num_tokens": 914474.0, + "reward": 285.97991943359375, + "reward_std": 89.53257751464844, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 59.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.0060519445687532425, + "rewards/sentence_structure_reward/std": 0.000851836521178484, + "step": 245 }, { "clip_ratio/high_max": 0.0, @@ -1327,21 +7127,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0008843942860438736, - "grad_norm": 0.40406107902526855, - "kl": 0.0581416460336186, - "learning_rate": 1.5224093497742654e-06, - "loss": 0.0023, - "num_tokens": 171435.0, - "reward": 121.226806640625, - "reward_std": 88.12857818603516, - "rewards/keyword_presence_reward/mean": 0.75, - "rewards/keyword_presence_reward/std": 0.4629100561141968, - "rewards/reward_keyword_presence/mean": 25.0, - "rewards/reward_keyword_presence/std": 18.898223876953125, - "rewards/sentence_structure_reward/mean": 0.006196090951561928, - "rewards/sentence_structure_reward/std": 0.0026110997423529625, - "step": 46 + "epoch": 0.004916214514823586, + "grad_norm": 0.44403308629989624, + "kl": 0.09994169790297747, + "learning_rate": 2.128140439961426e-05, + "loss": 0.004, + "num_tokens": 918278.0, + "reward": 315.82611083984375, + "reward_std": 64.30816650390625, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 65.625, + "rewards/reward_keyword_presence/std": 12.938729286193848, + "rewards/sentence_structure_reward/mean": 0.006217414978891611, + "rewards/sentence_structure_reward/std": 0.0008467308944091201, + "step": 246 }, { "clip_ratio/high_max": 0.0, @@ -1349,28 +7149,28 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 1.0, + "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, - "completions/max_terminated_length": 0.0, - "completions/mean_length": 256.0, - "completions/mean_terminated_length": 0.0, - "completions/min_length": 256.0, - "completions/min_terminated_length": 0.0, - "epoch": 0.0009036202487839579, - "grad_norm": 0.36516907811164856, - "kl": 0.006514020642498508, - "learning_rate": 9.788696740969295e-07, - "loss": 0.0003, - "num_tokens": 176039.0, - "reward": 15.26778507232666, - "reward_std": 30.446683883666992, - "rewards/keyword_presence_reward/mean": 0.125, - "rewards/keyword_presence_reward/std": 0.3535533845424652, - "rewards/reward_keyword_presence/mean": 3.125, - "rewards/reward_keyword_presence/std": 8.838834762573242, - "rewards/sentence_structure_reward/mean": 0.00827835500240326, - "rewards/sentence_structure_reward/std": 0.0029045019764453173, - "step": 47 + "completions/max_terminated_length": 221.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.004936199126672462, + "grad_norm": 0.46143680810928345, + "kl": 0.08986115735024214, + "learning_rate": 2.115341412670749e-05, + "loss": 0.0036, + "num_tokens": 921527.0, + "reward": 256.1347961425781, + "reward_std": 78.58533477783203, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 53.125, + "rewards/reward_keyword_presence/std": 16.02174949645996, + "rewards/sentence_structure_reward/mean": 0.006098168436437845, + "rewards/sentence_structure_reward/std": 0.0010080330539494753, + "step": 247 }, { "clip_ratio/high_max": 0.0, @@ -1385,21 +7185,21 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.000922846211524042, - "grad_norm": 0.4312228560447693, - "kl": 0.01533403858775273, - "learning_rate": 5.526015920464689e-07, - "loss": 0.0006, - "num_tokens": 180175.0, - "reward": 75.85955047607422, - "reward_std": 79.68181610107422, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 15.625, - "rewards/reward_keyword_presence/std": 18.600595474243164, - "rewards/sentence_structure_reward/mean": 0.007199674844741821, - "rewards/sentence_structure_reward/std": 0.001280520693399012, - "step": 48 + "epoch": 0.004956183738521339, + "grad_norm": 0.4772818684577942, + "kl": 0.0751778595149517, + "learning_rate": 2.1025376441463585e-05, + "loss": 0.003, + "num_tokens": 925439.0, + "reward": 181.22052001953125, + "reward_std": 137.2213134765625, + "rewards/keyword_presence_reward/mean": 0.875, + "rewards/keyword_presence_reward/std": 0.3535533845424652, + "rewards/reward_keyword_presence/mean": 37.5, + "rewards/reward_keyword_presence/std": 26.726125717163086, + "rewards/sentence_structure_reward/mean": 0.006347354501485825, + "rewards/sentence_structure_reward/std": 0.0009080208255909383, + "step": 248 }, { "clip_ratio/high_max": 0.0, @@ -1414,65 +7214,65 @@ "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, - "epoch": 0.0009420721742641263, - "grad_norm": 0.35108092427253723, - "kl": 0.00910030017257668, - "learning_rate": 2.462331880972468e-07, - "loss": 0.0004, - "num_tokens": 184243.0, - "reward": 60.939937591552734, - "reward_std": 60.90483093261719, - "rewards/keyword_presence_reward/mean": 0.5, - "rewards/keyword_presence_reward/std": 0.5345224738121033, - "rewards/reward_keyword_presence/mean": 12.5, - "rewards/reward_keyword_presence/std": 13.363062858581543, - "rewards/sentence_structure_reward/mean": 0.007790783420205116, - "rewards/sentence_structure_reward/std": 0.0025173062458634377, - "step": 49 + "epoch": 0.004976168350370215, + "grad_norm": 0.4910058379173279, + "kl": 0.08575612143613398, + "learning_rate": 2.08972966070103e-05, + "loss": 0.0034, + "num_tokens": 928951.0, + "reward": 285.9847106933594, + "reward_std": 91.61221313476562, + "rewards/keyword_presence_reward/mean": 1.0, + "rewards/keyword_presence_reward/std": 0.0, + "rewards/reward_keyword_presence/mean": 59.375, + "rewards/reward_keyword_presence/std": 18.600595474243164, + "rewards/sentence_structure_reward/mean": 0.00698145292699337, + "rewards/sentence_structure_reward/std": 0.0008538965485058725, + "step": 249 }, { - "epoch": 0.0009612981370042105, - "grad_norm": 0.483894407749176, - "learning_rate": 6.165332533744072e-08, - "loss": 0.0006, - "step": 50 + "epoch": 0.004996152962219091, + "grad_norm": 0.43265923857688904, + "learning_rate": 2.0769179888207968e-05, + "loss": 0.0031, + "step": 250 }, { - "epoch": 0.0009612981370042105, + "epoch": 0.004996152962219091, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, - "eval_completions/clipped_ratio": 0.9824144486692015, + "eval_completions/clipped_ratio": 0.99435, "eval_completions/max_length": 256.0, - "eval_completions/max_terminated_length": 25.46197718631179, - "eval_completions/mean_length": 254.8384030418251, - "eval_completions/mean_terminated_length": 25.27851711026616, - "eval_completions/min_length": 247.02661596958174, - "eval_completions/min_terminated_length": 25.09505703422053, - "eval_kl": 0.014850540803709978, - "eval_loss": -0.0011240014573559165, - "eval_num_tokens": 188379.0, - "eval_reward": 86.75708762757452, - "eval_reward_std": 67.94442815555989, - "eval_rewards/keyword_presence_reward/mean": 0.5957699619771863, - "eval_rewards/keyword_presence_reward/std": 0.47042394290632167, - "eval_rewards/reward_keyword_presence/mean": 17.85884030418251, - "eval_rewards/reward_keyword_presence/std": 15.781666458332946, - "eval_rewards/sentence_structure_reward/mean": 0.006974678838930089, - "eval_rewards/sentence_structure_reward/std": 0.0018865335714992688, - "eval_runtime": 9658.4971, + "eval_completions/max_terminated_length": 6.882, + "eval_completions/mean_length": 255.43485, + "eval_completions/mean_terminated_length": 6.839066668701172, + "eval_completions/min_length": 251.5372, + "eval_completions/min_terminated_length": 6.8012, + "eval_kl": 0.10415139682888985, + "eval_loss": 0.0023702599573880434, + "eval_num_tokens": 933123.0, + "eval_reward": 268.05107924804685, + "eval_reward_std": 85.6288213046235, + "eval_rewards/keyword_presence_reward/mean": 0.97725, + "eval_rewards/keyword_presence_reward/std": 0.05880611290931702, + "eval_rewards/reward_keyword_presence/mean": 55.63125, + "eval_rewards/reward_keyword_presence/std": 19.147240299987793, + "eval_rewards/sentence_structure_reward/mean": 0.006735013627633453, + "eval_rewards/sentence_structure_reward/std": 0.0016883452501613647, + "eval_runtime": 45917.8708, "eval_samples_per_second": 0.109, "eval_steps_per_second": 0.014, - "step": 50 + "step": 250 } ], "logging_steps": 1, - "max_steps": 50, - "num_input_tokens_seen": 188379, + "max_steps": 500, + "num_input_tokens_seen": 933123, "num_train_epochs": 1, - "save_steps": 50, + "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { @@ -1480,7 +7280,7 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} }